From b3f9f5d787f564cc86dbddaf75997f805a9cc90a Mon Sep 17 00:00:00 2001
From: KlemenSkrlj <47853619+klemen1999@users.noreply.github.com>
Date: Tue, 17 Sep 2024 09:14:40 +0200
Subject: [PATCH 1/2] [Fix] Corrected config valid sequence for predefined models (#72)

---
 luxonis_train/utils/config.py | 42 +++++++++++++++++------------------
 1 file changed, 21 insertions(+), 21 deletions(-)

diff --git a/luxonis_train/utils/config.py b/luxonis_train/utils/config.py
index bdcd13dc..31e4fe5b 100644
--- a/luxonis_train/utils/config.py
+++ b/luxonis_train/utils/config.py
@@ -77,27 +77,6 @@ class ModelConfig(BaseModelExtraForbid):
     visualizers: list[AttachedModuleConfig] = []
     outputs: list[str] = []
 
-    @model_validator(mode="after")
-    def check_main_metric(self) -> Self:
-        for metric in self.metrics:
-            if metric.is_main_metric:
-                logger.info(f"Main metric: `{metric.name}`")
-                return self
-
-        logger.warning("No main metric specified.")
-        if self.metrics:
-            metric = self.metrics[0]
-            metric.is_main_metric = True
-            name = metric.alias or metric.name
-            logger.info(f"Setting '{name}' as main metric.")
-        else:
-            logger.error(
-                "No metrics specified. "
-                "This is likely unintended unless "
-                "the configuration is not used for training."
-            )
-        return self
-
     @model_validator(mode="after")
     def check_predefined_model(self) -> Self:
         from luxonis_train.utils.registry import MODELS
@@ -120,6 +99,27 @@ def check_predefined_model(self) -> Self:
 
         return self
 
+    @model_validator(mode="after")
+    def check_main_metric(self) -> Self:
+        for metric in self.metrics:
+            if metric.is_main_metric:
+                logger.info(f"Main metric: `{metric.name}`")
+                return self
+
+        logger.warning("No main metric specified.")
+        if self.metrics:
+            metric = self.metrics[0]
+            metric.is_main_metric = True
+            name = metric.alias or metric.name
+            logger.info(f"Setting '{name}' as main metric.")
+        else:
+            logger.error(
+                "No metrics specified. "
+                "This is likely unintended unless "
+                "the configuration is not used for training."
+            )
+        return self
+
     @model_validator(mode="after")
     def check_graph(self) -> Self:
         from luxonis_train.utils.general import is_acyclic

From 7daabdc60552ec9068085e15243496568cbc5492 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Kozlovsk=C3=BD?=
Date: Thu, 19 Sep 2024 01:05:59 -0400
Subject: [PATCH 2/2] Code Cleanup and Improved Tests (#69)

Co-authored-by: GitHub Actions
---
 .github/CODEOWNERS | 1 +
 .github/labeler.yaml | 32 +
 .github/workflows/ci.yaml | 175 ++++
 .github/workflows/docs.yaml | 26 -
 .github/workflows/pre-commit.yaml | 13 -
 .github/workflows/tests.yaml | 126 ---
 .gitignore | 2 +-
 .pre-commit-config.yaml | 6 +-
 CONTRIBUTING.md | 103 ++-
 configs/README.md | 6 +-
 configs/classification_model.yaml | 4 +-
 configs/coco_model.yaml | 10 +-
 configs/detection_model.yaml | 4 +-
 configs/efficient_coco_model.yaml | 10 +-
 configs/example_export.yaml | 4 +-
 configs/example_multi_input.yaml | 4 +-
 configs/example_tuning.yaml | 2 +-
 configs/keypoint_bbox_model.yaml | 4 +-
 configs/resnet_model.yaml | 4 +-
 configs/segmentation_model.yaml | 4 +-
 luxonis_train/__init__.py | 8 +-
 luxonis_train/__main__.py | 33 +-
 luxonis_train/{utils => }/assigners/__init__.py | 0
 luxonis_train/{utils => }/assigners/atts_assigner.py | 84 +-
 luxonis_train/{utils => }/assigners/tal_assigner.py | 59 +-
 luxonis_train/{utils => }/assigners/utils.py | 23 +-
 .../attached_modules/base_attached_module.py | 212 +++--
 .../losses/adaptive_detection_loss.py | 220 +++--
 .../attached_modules/losses/base_loss.py | 22 +-
 .../losses/bce_with_logits.py | 63 +-
 .../attached_modules/losses/cross_entropy.py | 15 +-
 .../losses/efficient_keypoint_bbox_loss.py | 293 +++---
 .../losses/implicit_keypoint_bbox_loss.py | 136 +--
 .../attached_modules/losses/keypoint_loss.py | 95 +-
 .../losses/sigmoid_focal_loss.py | 10 +-
 .../losses/smooth_bce_with_logits.py | 65 +-
 .../losses/softmax_focal_loss.py | 28 +-
 .../attached_modules/metrics/__init__.py | 2 +-
 .../attached_modules/metrics/base_metric.py | 19 +-
 .../attached_modules/metrics/common.py | 92 --
 .../metrics/mean_average_precision.py | 42 +-
 .../mean_average_precision_keypoints.py | 112 ++-
 .../metrics/object_keypoint_similarity.py | 171 ++--
 .../attached_modules/metrics/torchmetrics.py | 114 +++
 .../visualizers/base_visualizer.py | 18 +-
 .../visualizers/bbox_visualizer.py | 60 +-
 .../visualizers/classification_visualizer.py | 28 +-
 .../visualizers/keypoint_visualizer.py | 24 +-
 .../visualizers/multi_visualizer.py | 19 +-
 .../visualizers/segmentation_visualizer.py | 17 +-
 .../attached_modules/visualizers/utils.py | 36 +-
 luxonis_train/callbacks/__init__.py | 8 +
 .../callbacks/archive_on_train_end.py | 4 +-
 .../callbacks/export_on_train_end.py | 2 +-
 luxonis_train/callbacks/gpu_stats_monitor.py | 109 +--
 .../callbacks/luxonis_progress_bar.py | 35 +-
 luxonis_train/callbacks/metadata_logger.py | 37 +-
 luxonis_train/callbacks/module_freezer.py | 3 +-
 luxonis_train/callbacks/needs_checkpoint.py | 7 +-
 luxonis_train/callbacks/test_on_train_end.py | 4 +-
 luxonis_train/callbacks/upload_checkpoint.py | 7 +-
 luxonis_train/core/core.py | 234 +++--
 luxonis_train/core/utils/archive_utils.py | 33 +-
 luxonis_train/core/utils/export_utils.py | 8 +-
 luxonis_train/core/utils/train_utils.py | 23 +-
 luxonis_train/core/utils/tune_utils.py | 12 +-
 luxonis_train/{utils => }/loaders/__init__.py | 0
 .../{utils => }/loaders/base_loader.py | 97 +-
 .../loaders/luxonis_loader_torch.py | 4 +-
 luxonis_train/models/luxonis_lightning.py | 223 +++--
 luxonis_train/models/luxonis_output.py | 3 +-
 .../base_predefined_model.py | 26 +-
 .../predefined_models/classification_model.py | 4 +-
 .../predefined_models/detection_model.py | 6 +-
 .../keypoint_detection_model.py | 6 +-
 .../predefined_models/segmentation_model.py | 2 +-
 luxonis_train/nodes/README.md | 10 +-
 luxonis_train/nodes/activations/__init__.py | 4 +-
 .../nodes/activations/activations.py | 11 -
 .../nodes/backbones/contextspatial.py | 97 +-
 luxonis_train/nodes/backbones/efficientnet.py | 48 +-
 .../nodes/backbones/efficientrep/__init__.py | 3 +
 .../{ => efficientrep}/efficientrep.py | 98 +-
 .../nodes/backbones/efficientrep/variants.py | 44 +
 luxonis_train/nodes/backbones/micronet.py | 842 ------------------
 .../nodes/backbones/micronet/__init__.py | 3 +
 .../nodes/backbones/micronet/blocks.py | 515 +++++++++++
 .../nodes/backbones/micronet/micronet.py | 62 ++
 .../nodes/backbones/micronet/variants.py | 344 +++++++
 luxonis_train/nodes/backbones/mobilenetv2.py | 57 +-
 .../nodes/backbones/mobileone/__init__.py | 3 +
 .../{mobileone.py => mobileone/blocks.py} | 214 +----
 .../nodes/backbones/mobileone/mobileone.py | 197 ++++
 .../nodes/backbones/mobileone/variants.py | 39 +
 luxonis_train/nodes/backbones/repvgg.py | 149 ----
 .../nodes/backbones/repvgg/__init__.py | 3 +
 .../nodes/backbones/repvgg/repvgg.py | 135 +++
 .../nodes/backbones/repvgg/variants.py | 31 +
 luxonis_train/nodes/backbones/resnet.py | 128 ++-
 luxonis_train/nodes/backbones/rexnetv1.py | 102 ++-
 luxonis_train/nodes/base_node.py | 330 ++++---
 luxonis_train/nodes/blocks/blocks.py | 86 +-
 luxonis_train/nodes/heads/bisenet_head.py | 50 +-
 .../nodes/heads/classification_head.py | 15 +-
 .../nodes/heads/efficient_bbox_head.py | 72 +-
 .../heads/efficient_keypoint_bbox_head.py | 52 +-
 .../heads/implicit_keypoint_bbox_head.py | 103 ++-
 .../nodes/heads/segmentation_head.py | 32 +-
 luxonis_train/nodes/necks/reppan_neck.py | 148 +--
 luxonis_train/optimizers/__init__.py | 1 +
 .../{utils => optimizers}/optimizers.py | 2 +-
 luxonis_train/schedulers/__init__.py | 1 +
 .../{utils => schedulers}/schedulers.py | 0
 luxonis_train/utils/__init__.py | 57 +-
 .../utils/{boxutils.py => boundingbox.py} | 161 ++--
 luxonis_train/utils/config.py | 72 +-
 luxonis_train/utils/dataset_metadata.py | 154 ++++
 luxonis_train/utils/exceptions.py | 12 +
 luxonis_train/utils/general.py | 345 +++----
 luxonis_train/utils/graph.py | 92 ++
 luxonis_train/utils/keypoints.py | 85 ++
 luxonis_train/utils/registry.py | 26 +-
 luxonis_train/utils/tracker.py | 9 +-
 luxonis_train/utils/types.py | 44 +-
 media/coverage_badge.svg | 6 +-
 pyproject.toml | 48 +-
 requirements-dev.txt | 2 +
 tests/__init__.py | 0
 tests/configs/archive_config.yaml | 43 +
 tests/configs/parking_lot_config.yaml | 81 +-
 tests/configs/segmentation_parse_loader.yaml | 4 +-
 tests/conftest.py | 18 +
 tests/integration/__init__.py | 0
 tests/integration/conftest.py | 104 ++-
 tests/integration/multi_input_modules.py | 22 +-
 tests/integration/parking_lot.json | 65 +-
 tests/integration/test_detection.py | 95 ++
 tests/integration/test_sanity.py | 136 ---
 tests/integration/test_segmentation.py | 134 +++
 tests/integration/test_simple.py | 215 +++++
 tests/unittests/__init__.py | 2 -
 tests/unittests/test_assigners/__init__.py | 0
 .../test_assigners/test_atts_assigner.py | 21 +-
 .../test_assigners/test_tal_assigner.py | 135 +++
 .../test_assigners/test_utils.py | 2 +-
 tests/unittests/test_base_attached_module.py | 153 ++++
 tests/unittests/test_base_node.py | 160 ++++
 tests/unittests/test_blocks.py | 15 +
 tests/unittests/test_callbacks/__init__.py | 0
 .../test_callbacks/test_needs_checkpoint.py | 6 +
 tests/unittests/test_loaders/__init__.py | 0
 .../test_loaders/test_base_loader.py | 94 ++
 .../test_losses/test_bce_with_logits_loss.py | 10 +-
 .../test_metrics/test_torchmetrics.py | 52 ++
 .../test_assigners/test_tal_assigner.py | 165 ----
 tests/unittests/test_utils/test_boxutils.py | 79 +-
 .../test_utils/test_dataset_metadata.py | 53 ++
 tests/unittests/test_utils/test_general.py | 44 +
 tests/unittests/test_utils/test_graph.py | 79 ++
 tests/unittests/test_utils/test_keypoints.py | 24 +
 .../test_loaders/test_base_loader.py | 69 --
 161 files changed, 6539 insertions(+), 4242 deletions(-)
 create mode 100644 .github/CODEOWNERS
 create mode 100644 .github/labeler.yaml
 create mode 100644 .github/workflows/ci.yaml
 delete mode 100644 .github/workflows/docs.yaml
 delete mode 100644 .github/workflows/pre-commit.yaml
 delete mode 100644 .github/workflows/tests.yaml
 rename luxonis_train/{utils => }/assigners/__init__.py (100%)
 rename luxonis_train/{utils => }/assigners/atts_assigner.py (84%)
 rename luxonis_train/{utils => }/assigners/tal_assigner.py (87%)
 rename luxonis_train/{utils => }/assigners/utils.py (88%)
 delete mode 100644 luxonis_train/attached_modules/metrics/common.py
 create mode 100644 luxonis_train/attached_modules/metrics/torchmetrics.py
 rename luxonis_train/{utils => }/loaders/__init__.py (100%)
 rename luxonis_train/{utils => }/loaders/base_loader.py (65%)
 rename luxonis_train/{utils => }/loaders/luxonis_loader_torch.py (98%)
 create mode 100644 luxonis_train/nodes/backbones/efficientrep/__init__.py
 rename luxonis_train/nodes/backbones/{ => efficientrep}/efficientrep.py (53%)
 create mode 100644 luxonis_train/nodes/backbones/efficientrep/variants.py
 delete mode 100644 luxonis_train/nodes/backbones/micronet.py
 create mode 100644 luxonis_train/nodes/backbones/micronet/__init__.py
 create mode 100644 luxonis_train/nodes/backbones/micronet/blocks.py
 create mode 100644 luxonis_train/nodes/backbones/micronet/micronet.py
 create mode 100644 luxonis_train/nodes/backbones/micronet/variants.py
 create mode 100644 luxonis_train/nodes/backbones/mobileone/__init__.py
 rename luxonis_train/nodes/backbones/{mobileone.py => mobileone/blocks.py} (55%)
 create mode 100644 luxonis_train/nodes/backbones/mobileone/mobileone.py
 create mode 100644 luxonis_train/nodes/backbones/mobileone/variants.py
 delete mode 100644 luxonis_train/nodes/backbones/repvgg.py
 create mode 100644 luxonis_train/nodes/backbones/repvgg/__init__.py
 create mode 100644 luxonis_train/nodes/backbones/repvgg/repvgg.py
 create mode 100644 luxonis_train/nodes/backbones/repvgg/variants.py
 create mode 100644 luxonis_train/optimizers/__init__.py
 rename luxonis_train/{utils => optimizers}/optimizers.py (92%)
 create mode 100644 luxonis_train/schedulers/__init__.py
 rename luxonis_train/{utils => schedulers}/schedulers.py (100%)
 rename luxonis_train/utils/{boxutils.py => boundingbox.py} (87%)
 create mode 100644 luxonis_train/utils/dataset_metadata.py
 create mode 100644 luxonis_train/utils/exceptions.py
 create mode 100644 luxonis_train/utils/graph.py
 create mode 100644 luxonis_train/utils/keypoints.py
 create mode 100644 tests/__init__.py
 create mode 100644 tests/configs/archive_config.yaml
 create mode 100644 tests/conftest.py
 create mode 100644 tests/integration/__init__.py
 create mode 100644 tests/integration/test_detection.py
 delete mode 100644 tests/integration/test_sanity.py
 create mode 100644 tests/integration/test_segmentation.py
 create mode 100644 tests/integration/test_simple.py
 create mode 100644 tests/unittests/test_assigners/__init__.py
 rename tests/unittests/{test_utils => }/test_assigners/test_atts_assigner.py (88%)
 create mode 100644 tests/unittests/test_assigners/test_tal_assigner.py
 rename tests/unittests/{test_utils => }/test_assigners/test_utils.py (96%)
 create mode 100644 tests/unittests/test_base_attached_module.py
 create mode 100644 tests/unittests/test_base_node.py
 create mode 100644 tests/unittests/test_blocks.py
 create mode 100644 tests/unittests/test_callbacks/__init__.py
 create mode 100644 tests/unittests/test_callbacks/test_needs_checkpoint.py
 create mode 100644 tests/unittests/test_loaders/__init__.py
 create mode 100644 tests/unittests/test_loaders/test_base_loader.py
 create mode 100644 tests/unittests/test_metrics/test_torchmetrics.py
 delete mode 100644 tests/unittests/test_utils/test_assigners/test_tal_assigner.py
 create mode 100644 tests/unittests/test_utils/test_dataset_metadata.py
 create mode 100644 tests/unittests/test_utils/test_general.py
 create mode 100644 tests/unittests/test_utils/test_graph.py
 create mode 100644 tests/unittests/test_utils/test_keypoints.py
 delete mode 100644 tests/unittests/test_utils/test_loaders/test_base_loader.py

diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
new file mode 100644
index 00000000..a6eef919
--- /dev/null
+++ b/.github/CODEOWNERS
@@ -0,0 +1 @@
+* @luxonis/ML-Reviewers
diff --git a/.github/labeler.yaml b/.github/labeler.yaml
new file mode 100644
index 00000000..33749bd5
--- /dev/null
+++ b/.github/labeler.yaml
@@ -0,0 +1,32 @@
+tests:
+  - changed-files:
+      - any-glob-to-any-file: 'tests/*'
+  - head-branch:
+      - 'test/*'
+      - 'tests/*'
+
+DevOps:
+  - changed-files:
+      - any-glob-to-any-file: '.github/*'
+
+CLI:
+  - changed-files:
+      - any-glob-to-any-file: '**/__main__.py'
+
+release:
+  - base-branch: 'main'
+
+enhancement:
+  - head-branch:
+      - 'feature/*'
+      - 'feat/*'
+      - 'enhancement/*'
+
+fix:
+  - head-branch:
+      - 'fix/*'
+      - 'bug/*'
+      - 'hotfix/*'
+      - 'issue/*'
+      - 'bugfix/*'
+      - 'patch/*'
diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
new file mode 100644
index 00000000..6dbf1a87
--- /dev/null
+++ b/.github/workflows/ci.yaml
@@ -0,0 +1,175 @@
+name: CI
+
+on:
+  pull_request:
+    branches: [ dev, main ]
+    paths:
+      - 'luxonis_train/**'
+      - 'tests/**'
+      - .github/workflows/ci.yaml
+      - '!**/*.md'
+      - '!luxonis_train/__main__.py'
+
+permissions:
+  pull-requests: write
+  contents: write
+  checks: write
+
+jobs:
+  assigner:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Auto-assign
+        uses: toshimaru/auto-author-assign@v2.1.1
+
+  labeler:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ github.head_ref }}
+
+      - name: Labeler
+        uses: actions/labeler@v5
+        with:
+          configuration-path: .github/labeler.yaml
+
+  pre-commit:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ github.head_ref }}
+
+      - name: Run pre-commit
+        uses: pre-commit/action@v3.0.1
+
+  docs:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ github.head_ref }}
+
+      - name: Install dependencies
+        run: |
+          sudo apt update
+          sudo apt install -y pandoc
+          pip install pydoctor
+          curl -L "https://raw.githubusercontent.com/luxonis/python-api-analyzer-to-json/main/gen-docs.py" -o "gen-docs.py"
+
+      - name: Build docs
+        run: python gen-docs.py luxonis_train
+
+  type-check:
+    needs:
+      - pre-commit
+      - docs
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ github.head_ref }}
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.10'
+          cache: pip
+
+      - name: Install dependencies
+        run: pip install -e .[dev]
+
+      - name: Type check
+        uses: jakebailey/pyright-action@v2
+        with:
+          version: '1.1.380'
+          level: warning
+          warnings: true
+          python-version: '3.10'
+          project: pyproject.toml
+
+  tests:
+    needs:
+      - type-check
+    strategy:
+      fail-fast: false
+      matrix:
+        os: [ubuntu-latest, windows-latest]
+
+    runs-on: ${{ matrix.os }}
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ github.head_ref }}
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.10'
+          cache: pip
+
+      - name: Install dependencies
+        run: pip install -e .[dev]
+
+      - name: Authenticate to Google Cloud
+        id: google-auth
+        uses: google-github-actions/auth@v2
+        with:
+          credentials_json: ${{ secrets.GOOGLE_APPLICATION_CREDENTIALS }}
+          create_credentials_file: true
+          export_environment_variables: true
+          token_format: access_token
+
+      - name: Run pytest
+        uses: pavelzw/pytest-action@v2
+        env:
+          LUXONISML_BUCKET: luxonis-test-bucket
+          PYTORCH_MPS_HIGH_WATERMARK_RATIO: 0.0
+        with:
+          emoji: false
+          custom-arguments: --junit-xml pytest.xml --cov luxonis_train --cov-report xml
+
+      - name: Create Test Report
+        uses: EnricoMi/publish-unit-test-result-action@v2
+        if: matrix.os == 'ubuntu-latest'
+        with:
+          files: pytest.xml
+
+      - name: Generate coverage badge
+        uses: tj-actions/coverage-badge-py@v2
+        if: matrix.os == 'ubuntu-latest'
+        with:
+          output: media/coverage_badge.svg
+
+      - name: Generate coverage report
+        uses: orgoro/coverage@v3.2
+        if: matrix.os == 'ubuntu-latest'
+        with:
+          coverageFile: coverage.xml
+          token: ${{ secrets.GITHUB_TOKEN }}
+          thresholdAll: 0.9
+          thresholdNew: 0.8
+
+      - name: Commit coverage badge
+        if: matrix.os == 'ubuntu-latest'
+        run: |
+          git config --global user.name 'GitHub Actions'
+          git config --global user.email 'actions@github.com'
+          git diff --quiet media/coverage_badge.svg || {
+            git add media/coverage_badge.svg
+            git commit -m "[Automated] Updated coverage badge"
+          }
+
+      - name: Push changes
+        uses: ad-m/github-push-action@master
+        if: matrix.os == 'ubuntu-latest'
+        with:
+          branch: ${{ github.head_ref }}
diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml
deleted file mode 100644
index f3c69761..00000000
--- a/.github/workflows/docs.yaml
+++ /dev/null
@@ -1,26 +0,0 @@
-name: Docs
-
-on:
-  pull_request:
-    branches: [ dev, main ]
-    paths:
-      - 'luxonis_train/**'
-      - .github/workflows/docs.yaml
-
-jobs:
-  docs:
-    runs-on: ubuntu-latest
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-        with:
-          ref: ${{ github.head_ref }}
-
-      - name: Install dependencies
-        run: |
-          pip install pydoctor
-          curl -L "https://raw.githubusercontent.com/luxonis/python-api-analyzer-to-json/main/gen-docs.py" -o "gen-docs.py"
-
-      - name: Build docs
-        run: |
-          python gen-docs.py luxonis_train
diff --git a/.github/workflows/pre-commit.yaml b/.github/workflows/pre-commit.yaml
deleted file mode 100644
index ce6b816b..00000000
--- a/.github/workflows/pre-commit.yaml
+++ /dev/null
@@ -1,13 +0,0 @@
-name: pre-commit
-
-on:
-  pull_request:
-    branches: [dev, main]
-
-jobs:
-  pre-commit:
-    runs-on: ubuntu-latest
-    steps:
-      - uses: actions/checkout@v3
-      - uses: actions/setup-python@v3
-      - uses: pre-commit/action@v3.0.0
diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml
deleted file mode 100644
index a0999d9b..00000000
--- a/.github/workflows/tests.yaml
+++ /dev/null
@@ -1,126 +0,0 @@
-name: Tests
-
-on:
-  pull_request:
-    branches: [ dev, main ]
-    paths:
-      - 'luxonis_train/**/**.py'
-      - 'tests/**/**.py'
-      - .github/workflows/tests.yaml
-
-jobs:
-  run_tests:
-    strategy:
-      fail-fast: false
-      matrix:
-        os: [ubuntu-latest, windows-latest]
-        version: ['3.10']
-
-    runs-on: ${{ matrix.os }}
-
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-        with:
-          ref: ${{ github.head_ref }}
-
-      - name: Set up Python
-        uses: actions/setup-python@v5
-        with:
-          python-version: ${{ matrix.version }}
-          cache: pip
-
-      - name: Install dependencies [Ubuntu]
-        if: matrix.os == 'ubuntu-latest'
-        run: |
-          sudo apt update
-          sudo apt install -y pandoc
-          pip install -e .[dev]
-
-      - name: Install dependencies [Windows]
-        if: matrix.os == 'windows-latest'
-        run: pip install -e .[dev]
-
-      - name: Install dependencies [macOS]
-        if: matrix.os == 'macOS-latest'
-        run: pip install -e .[dev]
-
-      - name: Authenticate to Google Cloud
-        id: google-auth
-        uses: google-github-actions/auth@v2
-        with:
-          credentials_json: ${{ secrets.GOOGLE_APPLICATION_CREDENTIALS }}
-          create_credentials_file: true
-          export_environment_variables: true
-          token_format: access_token
-
-      - name: Run tests with coverage [Ubuntu]
-        if: matrix.os == 'ubuntu-latest' && matrix.version == '3.10'
-        run: pytest tests --cov=luxonis_train --cov-report xml --junit-xml pytest.xml
-
-      - name: Run tests [Windows, macOS]
-        env:
-          PYTORCH_MPS_HIGH_WATERMARK_RATIO: 0.0
-        if: matrix.os != 'ubuntu-latest' || matrix.version != '3.10'
-        run: pytest tests --junit-xml pytest.xml
-
-      - name: Generate coverage badge [Ubuntu]
-        if: matrix.os == 'ubuntu-latest' && matrix.version == '3.10'
-        run: coverage-badge -o media/coverage_badge.svg -f
-
-      - name: Generate coverage report [Ubuntu]
-        if: matrix.os == 'ubuntu-latest' && matrix.version == '3.10'
-        uses: orgoro/coverage@v3.1
-        with:
-          coverageFile: coverage.xml
-          token: ${{ secrets.GITHUB_TOKEN }}
-
-      - name: Commit coverage badge [Ubuntu]
-        if: matrix.os == 'ubuntu-latest' && matrix.version == '3.10'
-        run: |
-          git config --global user.name 'GitHub Actions'
-          git config --global user.email 'actions@github.com'
-          git diff --quiet media/coverage_badge.svg || {
-            git add media/coverage_badge.svg
-            git commit -m "[Automated] Updated coverage badge"
-          }
-
-      - name: Push changes [Ubuntu]
-        if: matrix.os == 'ubuntu-latest' && matrix.version == '3.10'
-        uses: ad-m/github-push-action@master
-        with:
-          branch: ${{ github.head_ref }}
-
-      - name: Upload Test Results
-        if: always()
-        uses: actions/upload-artifact@v4
-        with:
-          name: Test Results [${{ matrix.os }}] (Python ${{ matrix.version }})
-          path: pytest.xml
-          retention-days: 10
-          if-no-files-found: error
-
-  publish-test-results:
-    name: "Publish Tests Results"
-    needs: run_tests
-    runs-on: ubuntu-latest
-    permissions:
-      checks: write
-      pull-requests: write
-    if: always()
-
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-        with:
-          ref: ${{ github.head_ref }}
-
-      - name: Download Artifacts
-        uses: actions/download-artifact@v4
-        with:
-          path: artifacts
-
-      - name: Publish Test Results
-        uses: EnricoMi/publish-unit-test-result-action@v2
-        with:
-          files: "artifacts/**/*.xml"
diff --git a/.gitignore b/.gitignore
index 7f182cf4..03ba884c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -152,5 +152,5 @@ mlartifacts
 mlruns
 wandb
 tests/_data
-tests/integration/_test-output
+tests/integration/save-directory
 data
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 3f95fc26..3d68c872 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,12 +1,11 @@
 repos:
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.1.8
+    rev: v0.6.4
    hooks:
       - id: ruff
         args: [--fix, --exit-non-zero-on-fix]
         types_or: [python, pyi, jupyter]
       - id: ruff-format
-        args: [--line-length, '88']
         types_or: [python, pyi, jupyter]
 
   - repo: https://github.com/PyCQA/docformatter
@@ -14,7 +13,7 @@ repos:
     hooks:
       - id: docformatter
         additional_dependencies: [tomli]
-        args: [--in-place, --black, --style=epytext]
+        args: [--in-place, --style=epytext]
 
   - repo: https://github.com/pre-commit/pre-commit-hooks
     rev: v4.4.0
@@ -28,4 +27,3 @@ repos:
       - id: mdformat
         additional_dependencies:
           - mdformat-gfm
-          - mdformat-toc
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index d113518b..20fd3607 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -3,25 +3,45 @@
 **This guide is intended for our internal development team.**
 It outlines our workflow and standards for contributing to this project.
 
-## Table of Contents
+## Table Of Contents
 
+- [Pre-requisites](#pre-requisites)
 - [Pre-commit Hooks](#pre-commit-hooks)
 - [Documentation](#documentation)
+- [Type Checking](#type-checking)
 - [Editor Support](#editor-support)
 - [Tests](#tests)
 - [GitHub Actions](#github-actions)
 - [Making and Reviewing Changes](#making-and-reviewing-changes)
-- [Notes](#notes)
+
+## Pre-requisites
+
+Clone the repository and navigate to the root directory:
+
+```bash
+git clone git@github.com:luxonis/luxonis-train.git
+cd luxonis-train
+```
+
+Install the development dependencies by running `pip install -r requirements-dev.txt` or install the package with the `dev` extra flag:
+
+```bash
+pip install -e .[dev]
+```
+
+> \[!NOTE\]
+> This will install the package in editable mode (`-e`),
+> so you can make changes to the code and run them immediately.
 
 ## Pre-commit Hooks
 
 We use pre-commit hooks to ensure code quality and consistency:
 
-1. Install pre-commit (see [pre-commit.com](https://pre-commit.com/#install)).
+1. Install `pre-commit` (see [pre-commit.com](https://pre-commit.com/#install)).
 1. Clone the repository and run `pre-commit install` in the root directory.
-1. The pre-commit hook will now run automatically on `git commit`.
+1. The `pre-commit` hook will now run automatically on `git commit`.
    - If the hook fails, it will print an error message and abort the commit.
-   - It will also modify the files in-place to fix any issues it can.
+   - Some hooks will also modify the files in-place to fix found issues.
 
 ## Documentation
 
@@ -29,52 +49,75 @@ We use the [Epytext](https://epydoc.sourceforge.net/epytext.html) markup languag
 To verify that your documentation is formatted correctly, follow these steps:
 
 1. Download [`get-docs.py`](https://github.com/luxonis/python-api-analyzer-to-json/blob/main/gen-docs.py) script
-1. Run `python3 get-docs.py luxonis_ml` in the root directory.
+1. Run `python3 get-docs.py luxonis_train` in the root directory.
    - If the script runs successfully and produces `docs.json` file, your documentation is formatted correctly.
-   - **NOTE:** If the script fails, it might not give the specific error message. In that case, you can run
-     the script for each file individually until you find the one that is causing the error.
-### Editor Support
+> \[!NOTE\]
+> If the script fails, it might not give a specific error message.
+> In that case, you can run the script for each file individually
+> until you find the one that is causing the error.
+
+**Editor Support:**
 
 - **PyCharm** - built in support for generating `epytext` docstrings
-- **Visual Studie Code** - [AI Docify](https://marketplace.visualstudio.com/items?itemName=AIC.docify) extension offers support for `epytext`
+- **Visual Studio Code** - [AI Docify](https://marketplace.visualstudio.com/items?itemName=AIC.docify) extension offers support for `epytext`
 - **NeoVim** - [vim-python-docstring](https://github.com/pixelneo/vim-python-docstring) supports `epytext` style
 
+## Type Checking
+
+The codebase is type-checked using [pyright](https://github.com/microsoft/pyright) `v1.1.380`. To run type checking, use the following command in the root project directory:
+
+```bash
+pyright --warnings --level warning --pythonversion 3.10 luxonis_train
+```
+
+**Editor Support:**
+
+- **PyCharm** - [Pyright](https://plugins.jetbrains.com/plugin/24145-pyright) extension
+- **Visual Studio Code** - [Pyright](https://marketplace.visualstudio.com/items?itemName=ms-pyright.pyright) extension
+- **NeoVim** - [LSP-Config](https://github.com/neovim/nvim-lspconfig) plugin with the [pyright configuration](https://github.com/neovim/nvim-lspconfig/blob/master/doc/server_configurations.md#pyright)
+
 ## Tests
 
 We use [pytest](https://docs.pytest.org/en/stable/) for testing.
-The tests are located in the `tests` directory. You can run the tests locally with:
+The tests are located in the `tests` directory. To run the tests with coverage, use the following command:
 
 ```bash
-pytest tests --cov=luxonis_train
+pytest --cov=luxonis_train --cov-report=html
 ```
 
-This command will run all tests and print a coverage report. The coverage report
-is only informational for now, but we may enforce a minimum coverage in the future.
+This command will run all tests and generate an HTML coverage report.
+
+> \[!TIP\]
+> The coverage report will be saved to the `htmlcov` directory.
+> If you want to inspect the coverage in more detail, open `htmlcov/index.html` in a browser.
+
+> \[!TIP\]
+> You can choose to run only the unit tests or only the integration tests by adding `-m unit` or `-m integration` to the `pytest` command.
 
-**If a new feature is added, a new test should be added to cover it.**
+> \[!IMPORTANT\]
+> If a new feature is added, a new test should be added to cover it.
+> The minimum overall test coverage for a PR to be merged is 90%.
+> The minimum coverage for new files is 80%.
 
 ## GitHub Actions
 
 Our GitHub Actions workflow is run when a new PR is opened.
-It first checks that the pre-commit hook passes and that the documentation builds successfully.
-The tests are run only if the pre-commit hook and documentation build pass.
-Successful tests are required for merging a PR.
 
-1. Checks and tests are run automatically when you open a pull request.
-1. For the tests to run, the [pre-commit](#pre-commit-hooks) hook must pass and
-   the [documentation](#documentation) must be built successfully.
-1. Review the GitHub Actions output if your PR fails.
-1. Fix any issues to ensure that all checks and tests pass.
+1. First, the [pre-commit](#pre-commit-hooks) hooks must pass and the [documentation](#documentation) must be built successfully.
+1. Next, [type checking](#type-checking) is run.
+1. If all previous checks pass, the [tests](#tests) are run.
+
+> \[!TIP\]
+> Review the GitHub Actions output if your PR fails.
+
+> \[!IMPORTANT\]
+> Successful completion of all the workflow checks is required for merging a PR.
 
-## Making and Reviewing Changes
+## Making and Submitting Changes
 
 1. Make changes in a new branch.
 1. Test your changes locally.
-1. Commit (pre-commit hook will run).
-1. Push to your branch and create a pull request. Always request a review from:
-   - [Martin Kozlovský](https://github.com/kozlov721)
-   - [Matija Teršek](https://github.com/tersekmatija)
-   - [Conor Simmons](https://github.com/conorsim)
-1. Any other relevant team members can be added as reviewers as well.
+1. Commit your changes (pre-commit hooks will run).
+1. Push your branch and create a pull request.
 1. The team will review and merge your PR.
diff --git a/configs/README.md b/configs/README.md
index a85d5221..96444f66 100644
--- a/configs/README.md
+++ b/configs/README.md
@@ -147,16 +147,16 @@ Here you can change everything related to actual training of the model.
 | accumulate_grad_batches | int | 1 | number of batches for gradient accumulation |
 | use_weighted_sampler | bool | False | bool if use WeightedRandomSampler for training, only works with classification tasks |
 | epochs | int | 100 | number of training epochs |
-| num_workers | int | 2 | number of workers for data loading |
+| n_workers | int | 2 | number of workers for data loading |
 | train_metrics_interval | int | -1 | frequency of computing metrics on train data, -1 if don't perform |
 | validation_interval | int | 1 | frequency of computing metrics on validation data |
-| num_log_images | int | 4 | maximum number of images to visualize and log |
+| n_log_images | int | 4 | maximum number of images to visualize and log |
 | skip_last_batch | bool | True | whether to skip last batch while training |
 | accelerator | Literal\["auto", "cpu", "gpu"\] | "auto" | What accelerator to use for training. |
 | devices | int \| list\[int\] \| str | "auto" | Either specify how many devices to use (int), list specific devices, or use "auto" for automatic configuration based on the selected accelerator |
 | matmul_precision | Literal\["medium", "high", "highest"\] \| None | None | Sets the internal precision of float32 matrix multiplications. |
 | strategy | Literal\["auto", "ddp"\] | "auto" | What strategy to use for training. |
-| num_sanity_val_steps | int | 2 | Number of sanity validation steps performed before training. |
+| n_sanity_val_steps | int | 2 | Number of sanity validation steps performed before training. |
 | profiler | Literal\["simple", "advanced"\] \| None | None | PL profiler for GPU/CPU/RAM utilization analysis |
 | verbose | bool | True | Print all intermediate results to console. |
diff --git a/configs/classification_model.yaml b/configs/classification_model.yaml
index be5a5006..4db7a9b1 100644
--- a/configs/classification_model.yaml
+++ b/configs/classification_model.yaml
@@ -25,9 +25,9 @@ trainer:
 
   batch_size: 4
   epochs: &epochs 200
-  num_workers: 4
+  n_workers: 4
   validation_interval: 10
-  num_log_images: 8
+  n_log_images: 8
 
   callbacks:
     - name: ExportOnTrainEnd
diff --git a/configs/coco_model.yaml b/configs/coco_model.yaml
index 9af25feb..23516bea 100644
--- a/configs/coco_model.yaml
+++ b/configs/coco_model.yaml
@@ -7,7 +7,7 @@ model:
     - name: EfficientRep
       params:
         channels_list: [64, 128, 256, 512, 1024]
-        num_repeats: [1, 6, 12, 18, 6]
+        n_repeats: [1, 6, 12, 18, 6]
         depth_mul: 0.33
         width_mul: 0.33
 
@@ -16,7 +16,7 @@ model:
         - EfficientRep
       params:
         channels_list: [256, 128, 128, 256, 256, 512]
-        num_repeats: [12, 12, 12, 12]
+        n_repeats: [12, 12, 12, 12]
         depth_mul: 0.33
         width_mul: 0.33
 
@@ -108,16 +108,16 @@ trainer:
   devices: auto
   strategy: auto
 
-  num_sanity_val_steps: 1
+  n_sanity_val_steps: 1
   profiler: null
   verbose: True
   batch_size: 4
   accumulate_grad_batches: 1
   epochs: &epochs 200
-  num_workers: 8
+  n_workers: 8
   train_metrics_interval: -1
   validation_interval: 10
-  num_log_images: 8
+  n_log_images: 8
   skip_last_batch: True
   log_sub_losses: True
   save_top_k: 3
diff --git a/configs/detection_model.yaml b/configs/detection_model.yaml
index 45c3431e..7bc87eef 100644
--- a/configs/detection_model.yaml
+++ b/configs/detection_model.yaml
@@ -20,9 +20,9 @@ trainer:
 
   batch_size: 4
   epochs: &epochs 200
-  num_workers: 4
+  n_workers: 4
   validation_interval: 10
-  num_log_images: 8
+  n_log_images: 8
 
   callbacks:
     - name: ExportOnTrainEnd
diff --git a/configs/efficient_coco_model.yaml b/configs/efficient_coco_model.yaml
index 64aa48e0..f2c9db5d 100644
--- a/configs/efficient_coco_model.yaml
+++ b/configs/efficient_coco_model.yaml
@@ -5,7 +5,7 @@ model:
     - name: EfficientRep
       params:
         channels_list: [64, 128, 256, 512, 1024]
-        num_repeats: [1, 6, 12, 18, 6]
+        n_repeats: [1, 6, 12, 18, 6]
         depth_mul: 0.33
         width_mul: 0.33
 
@@ -14,7 +14,7 @@ model:
         - EfficientRep
       params:
         channels_list: [256, 128, 128, 256, 256, 512]
-        num_repeats: [12, 12, 12, 12]
+        n_repeats: [12, 12, 12, 12]
         depth_mul: 0.33
         width_mul: 0.33
 
@@ -91,14 +91,14 @@ loader:
 
 trainer:
 
-  num_sanity_val_steps: 1
+  n_sanity_val_steps: 1
   batch_size: 4
   accumulate_grad_batches: 1
   epochs: 200
-  num_workers: 4
+  n_workers: 4
   train_metrics_interval: -1
   validation_interval: 10
-  num_log_images: 8
+  n_log_images: 8
   save_top_k: 3
 
   preprocessing:
diff --git a/configs/example_export.yaml b/configs/example_export.yaml
index f86f1dfa..51f768dc 100644
--- a/configs/example_export.yaml
+++ b/configs/example_export.yaml
@@ -22,9 +22,9 @@ trainer:
 
   batch_size: 4
   epochs: &epochs 200
-  num_workers: 4
+  n_workers: 4
   validation_interval: 10
-  num_log_images: 8
+  n_log_images: 8
 
   optimizer:
     name: SGD
diff --git a/configs/example_multi_input.yaml b/configs/example_multi_input.yaml
index d185f37e..9632ed43 100644
--- a/configs/example_multi_input.yaml
+++ b/configs/example_multi_input.yaml
@@ -97,9 +97,9 @@ tracker:
 trainer:
   batch_size: 1
   epochs: 10
-  num_workers: 4
+  n_workers: 4
   validation_interval: 10
-  num_log_images: 4
+  n_log_images: 4
 
   callbacks:
     - name: ExportOnTrainEnd
diff --git a/configs/example_tuning.yaml b/configs/example_tuning.yaml
index b350ea2f..d8c9027d 100644
--- a/configs/example_tuning.yaml
+++ b/configs/example_tuning.yaml
@@ -30,7 +30,7 @@ trainer:
   batch_size: 4
   epochs: &epochs 100
   validation_interval: 10
-  num_log_images: 8
+  n_log_images: 8
 
   scheduler:
     name: CosineAnnealingLR
diff --git a/configs/keypoint_bbox_model.yaml b/configs/keypoint_bbox_model.yaml
index 5b1ebb2d..51554f73 100644
--- a/configs/keypoint_bbox_model.yaml
+++ b/configs/keypoint_bbox_model.yaml
@@ -18,9 +18,9 @@ trainer:
 
   batch_size: 4
   epochs: &epochs 200
-  num_workers: 4
+  n_workers: 4
   validation_interval: 10
-  num_log_images: 8
+  n_log_images: 8
 
   callbacks:
     - name: ExportOnTrainEnd
diff --git a/configs/resnet_model.yaml b/configs/resnet_model.yaml
index e8353870..bb9f8f62 100644
--- a/configs/resnet_model.yaml
+++ b/configs/resnet_model.yaml
@@ -36,9 +36,9 @@ loader:
 trainer:
   batch_size: 4
   epochs: &epochs 200
-  num_workers: 4
+  n_workers: 4
   validation_interval: 10
-  num_log_images: 8
+  n_log_images: 8
 
   preprocessing:
     train_image_size: [&height 224, &width 224]
diff --git a/configs/segmentation_model.yaml b/configs/segmentation_model.yaml
index a822d7c1..b403a75e 100644
--- a/configs/segmentation_model.yaml
+++ b/configs/segmentation_model.yaml
@@ -21,9 +21,9 @@ trainer:
 
   batch_size: 4
   epochs: &epochs 200
-  num_workers: 4
+  n_workers: 4
   validation_interval: 10
-  num_log_images: 8
+  n_log_images: 8
 
   callbacks:
     - name: ExportOnTrainEnd
diff --git a/luxonis_train/__init__.py b/luxonis_train/__init__.py
index 60d8d501..ebc4a719 100644
--- a/luxonis_train/__init__.py
+++ b/luxonis_train/__init__.py
@@ -1,7 +1,11 @@
+__version__ = "0.0.1"
+
+
 from .attached_modules import *
 from .core import *
+from .loaders import *
 from .models import *
 from .nodes import *
+from .optimizers import *
+from .schedulers import *
 from .utils import *
-
-__version__ = "0.0.1"
diff --git a/luxonis_train/__main__.py b/luxonis_train/__main__.py
index 454e9525..c3164227 100644
--- a/luxonis_train/__main__.py
+++ b/luxonis_train/__main__.py
@@ -41,7 +41,9 @@ class _ViewType(str, Enum):
     ),
 ]
 
-ViewType = Annotated[_ViewType, typer.Option(help="Which dataset view to use.")]
+ViewType = Annotated[
+    _ViewType, typer.Option(help="Which dataset view to use.")
+]
 
 SaveDirType = Annotated[
     Optional[Path],
@@ -53,7 +55,8 @@ def train(
     config: ConfigType = None,
     resume: Annotated[
-        Optional[str], typer.Option(help="Resume training from this checkpoint.")
+        Optional[str],
+        typer.Option(help="Resume training from this checkpoint."),
     ] = None,
     opts: OptsType = None,
 ):
@@ -65,7 +68,9 @@ def train(
 
 @app.command()
 def test(
-    config: ConfigType = None, view: ViewType = _ViewType.VAL, opts: OptsType = None
+    config: ConfigType = None,
+    view: ViewType = _ViewType.VAL,
+    opts: OptsType = None,
 ):
     """Evaluate model."""
     from luxonis_train.core import LuxonisModel
@@ -115,13 +120,26 @@ def inspect(
             case_sensitive=False,
         ),
     ] = "train",  # type: ignore
+    size_multiplier: Annotated[
+        float,
+        typer.Option(
+            ...,
+            "--size-multiplier",
+            "-s",
+            help=(
+                "Multiplier for the image size. "
+                "By default the images are shown in their original size."
+            ),
+            show_default=False,
+        ),
+    ] = 1.0,
     opts: OptsType = None,
 ):
     """Inspect dataset."""
     from lightning.pytorch import seed_everything
     from luxonis_ml.data.__main__ import inspect as lxml_inspect
 
-    from luxonis_train.utils.config import Config
+    from luxonis_train.utils import Config
 
     cfg = Config.get_config(config, opts)
     if cfg.trainer.seed is not None:
@@ -144,6 +162,7 @@ def inspect(
             name=cfg.loader.params["dataset_name"],
             view=[view],
             aug_config=f.name,
+            size_multiplier=size_multiplier,
         )
 
 
@@ -166,7 +185,7 @@ def archive(
 
 def version_callback(value: bool):
     if value:
-        typer.echo(f"LuxonisTrain Version: {version(__package__)}")
+        typer.echo(f"LuxonisTrain Version: {version('luxonis_train')}")
         raise typer.Exit()
 
 
@@ -175,7 +194,9 @@ def common(
     _: Annotated[
         bool,
         typer.Option(
-            "--version", callback=version_callback, help="Show version and exit."
+            "--version",
+            callback=version_callback,
+            help="Show version and exit.",
         ),
     ] = False,
     source: Annotated[
diff --git a/luxonis_train/utils/assigners/__init__.py b/luxonis_train/assigners/__init__.py
similarity index 100%
rename from luxonis_train/utils/assigners/__init__.py
rename to luxonis_train/assigners/__init__.py
diff --git a/luxonis_train/utils/assigners/atts_assigner.py b/luxonis_train/assigners/atts_assigner.py
similarity index 84%
rename from luxonis_train/utils/assigners/atts_assigner.py
rename to luxonis_train/assigners/atts_assigner.py
index 9a0466da..269496fa 100644
--- a/luxonis_train/utils/assigners/atts_assigner.py
+++ b/luxonis_train/assigners/atts_assigner.py
@@ -49,9 +49,10 @@ def forward(
         @type pred_bboxes: Tensor
         @param pred_bboxes: Predicted bboxes of shape [bs, n_anchors, 4]
         @rtype: tuple[Tensor, Tensor, Tensor, Tensor, Tensor]
-        @return: Assigned labels of shape [bs, n_anchors], assigned bboxes of shape [bs,
-            n_anchors, 4], assigned scores of shape [bs, n_anchors, n_classes] and
-            output positive mask of shape [bs, n_anchors].
+        @return: Assigned labels of shape [bs, n_anchors], assigned
+            bboxes of shape [bs, n_anchors, 4], assigned scores of shape
+            [bs, n_anchors, n_classes] and output positive mask of shape
+            [bs, n_anchors].
""" self.n_anchors = anchor_bboxes.size(0) @@ -61,9 +62,13 @@ def forward( if self.n_max_boxes == 0: device = gt_bboxes.device return ( - torch.full([self.bs, self.n_anchors], self.n_classes).to(device), + torch.full([self.bs, self.n_anchors], self.n_classes).to( + device + ), torch.zeros([self.bs, self.n_anchors, 4]).to(device), - torch.zeros([self.bs, self.n_anchors, self.n_classes]).to(device), + torch.zeros([self.bs, self.n_anchors, self.n_classes]).to( + device + ), torch.zeros([self.bs, self.n_anchors]).to(device), torch.zeros([self.bs, self.n_anchors]).to(device), ) @@ -78,7 +83,10 @@ def forward( gt_centers = self._get_bbox_center(gt_bboxes_flat) anchor_centers = self._get_bbox_center(anchor_bboxes) distances = ( - (gt_centers[:, None, :] - anchor_centers[None, :, :]).pow(2).sum(-1).sqrt() + (gt_centers[:, None, :] - anchor_centers[None, :, :]) + .pow(2) + .sum(-1) + .sqrt() ) distances = distances.reshape([self.bs, -1, self.n_anchors]) @@ -103,15 +111,18 @@ def forward( ) # Generate final assignments based on masks - assigned_labels, assigned_bboxes, assigned_scores = self._get_final_assignments( + ( + assigned_labels, + assigned_bboxes, + assigned_scores, + ) = self._get_final_assignments( gt_labels, gt_bboxes, assigned_gt_idx, mask_pos_sum ) # Soft label with IoU - if pred_bboxes is not None: - ious = batch_iou(gt_bboxes, pred_bboxes) * mask_pos - ious = ious.max(dim=-2)[0].unsqueeze(-1) - assigned_scores *= ious + ious = batch_iou(gt_bboxes, pred_bboxes) * mask_pos + ious = ious.max(dim=-2)[0].unsqueeze(-1) + assigned_scores *= ious out_mask_positive = mask_pos_sum.bool() @@ -141,12 +152,13 @@ def _select_topk_candidates( @type mask_gt: Tensor @param mask_gt: Mask for valid GT per image. @rtype: tuple[Tensor, Tensor] - @return: Mask of selected anchors and indices of selected anchors. + @return: Mask of selected anchors and indices of selected + anchors. """ mask_gt = mask_gt.repeat(1, 1, self.topk).bool() level_distances = torch.split(distances, n_level_bboxes, dim=-1) - is_in_topk_list = [] - topk_idxs = [] + is_in_topk_list: list[Tensor] = [] + topk_idxs: list[Tensor] = [] start_idx = 0 for per_level_distances, per_level_boxes in zip( level_distances, n_level_bboxes @@ -158,18 +170,20 @@ def _select_topk_candidates( ) topk_idxs.append(per_level_topk_idxs + start_idx) per_level_topk_idxs = torch.where( - mask_gt, per_level_topk_idxs, torch.zeros_like(per_level_topk_idxs) + mask_gt, + per_level_topk_idxs, + torch.zeros_like(per_level_topk_idxs), + ) + is_in_topk = F.one_hot(per_level_topk_idxs, per_level_boxes).sum( + dim=-2 ) - is_in_topk = F.one_hot(per_level_topk_idxs, per_level_boxes).sum(dim=-2) is_in_topk = torch.where( is_in_topk > 1, torch.zeros_like(is_in_topk), is_in_topk ) is_in_topk_list.append(is_in_topk.to(distances.dtype)) start_idx = end_idx - is_in_topk_list = torch.cat(is_in_topk_list, dim=-1) - topk_idxs = torch.cat(topk_idxs, dim=-1) - return is_in_topk_list, topk_idxs + return torch.cat(is_in_topk_list, dim=-1), torch.cat(topk_idxs, dim=-1) def _get_positive_samples( self, @@ -177,14 +191,18 @@ def _get_positive_samples( topk_idxs: Tensor, overlaps: Tensor, ) -> Tensor: - """Computes threshold and returns mask for samples over threshold. + """Computes threshold and returns mask for samples over + threshold. 
 
         @type is_in_topk: Tensor
-        @param is_in_topk: Mask of selected anchors [bx, n_max_boxes, n_anchors]
+        @param is_in_topk: Mask of selected anchors [bx, n_max_boxes,
+            n_anchors]
         @type topk_idxs: Tensor
-        @param topk_idxs: Indices of selected anchors [bx, n_max_boxes, topK * n_levels]
+        @param topk_idxs: Indices of selected anchors [bx, n_max_boxes,
+            topK * n_levels]
         @type overlaps: Tensor
-        @param overlaps: IoUs between GTs and anchors [bx, n_max_boxes, n_anchors]
+        @param overlaps: IoUs between GTs and anchors [bx, n_max_boxes,
+            n_anchors]
         @rtype: Tensor
         @return: Mask of positive samples [bx, n_max_boxes, n_anchors]
         """
@@ -199,14 +217,17 @@ def _get_positive_samples(
         assist_idxs = assist_idxs[:, None]
         flatten_idxs = topk_idxs + assist_idxs
         candidate_overlaps = _candidate_overlaps.reshape(-1)[flatten_idxs]
-        candidate_overlaps = candidate_overlaps.reshape([self.bs, self.n_max_boxes, -1])
+        candidate_overlaps = candidate_overlaps.reshape(
+            [self.bs, self.n_max_boxes, -1]
+        )
 
         overlaps_mean_per_gt = candidate_overlaps.mean(dim=-1, keepdim=True)
         overlaps_std_per_gt = candidate_overlaps.std(dim=-1, keepdim=True)
         overlaps_thr_per_gt = overlaps_mean_per_gt + overlaps_std_per_gt
 
         is_pos = torch.where(
-            _candidate_overlaps > overlaps_thr_per_gt.repeat([1, 1, self.n_anchors]),
+            _candidate_overlaps
+            > overlaps_thr_per_gt.repeat([1, 1, self.n_anchors]),
             is_in_topk,
             torch.zeros_like(is_in_topk),
         )
@@ -230,15 +251,18 @@ def _get_final_assignments(
         @type mask_pos_sum: Tensor
         @param mask_pos_sum: Mask of matched GTs [bs, n_anchors]
         @rtype: tuple[Tensor, Tensor, Tensor]
-        @return: Assigned labels of shape [bs, n_anchors], assigned bboxes of shape [bs,
-            n_anchors, 4], assigned scores of shape [bs, n_anchors, n_classes].
+        @return: Assigned labels of shape [bs, n_anchors], assigned
+            bboxes of shape [bs, n_anchors, 4], assigned scores of shape
+            [bs, n_anchors, n_classes].
""" # assigned target labels batch_idx = torch.arange( self.bs, dtype=gt_labels.dtype, device=gt_labels.device ) batch_idx = batch_idx[..., None] - assigned_gt_idx = (assigned_gt_idx + batch_idx * self.n_max_boxes).long() + assigned_gt_idx = ( + assigned_gt_idx + batch_idx * self.n_max_boxes + ).long() assigned_labels = gt_labels.flatten()[assigned_gt_idx.flatten()] assigned_labels = assigned_labels.reshape([self.bs, self.n_anchors]) assigned_labels = torch.where( @@ -252,7 +276,9 @@ def _get_final_assignments( assigned_bboxes = assigned_bboxes.reshape([self.bs, self.n_anchors, 4]) # assigned target scores - assigned_scores = F.one_hot(assigned_labels.long(), self.n_classes + 1).float() + assigned_scores = F.one_hot( + assigned_labels.long(), self.n_classes + 1 + ).float() assigned_scores = assigned_scores[:, :, : self.n_classes] return assigned_labels, assigned_bboxes, assigned_scores diff --git a/luxonis_train/utils/assigners/tal_assigner.py b/luxonis_train/assigners/tal_assigner.py similarity index 87% rename from luxonis_train/utils/assigners/tal_assigner.py rename to luxonis_train/assigners/tal_assigner.py index 08b5b461..ea228eba 100644 --- a/luxonis_train/utils/assigners/tal_assigner.py +++ b/luxonis_train/assigners/tal_assigner.py @@ -66,9 +66,10 @@ def forward( @type mask_gt: Tensor @param mask_gt: Mask for valid GTs [bs, n_max_boxes, 1] @rtype: tuple[Tensor, Tensor, Tensor, Tensor, Tensor] - @return: Assigned labels of shape [bs, n_anchors], assigned bboxes of shape [bs, - n_anchors, 4], assigned scores of shape [bs, n_anchors, n_classes] and - output mask of shape [bs, n_anchors] + @return: Assigned labels of shape [bs, n_anchors], assigned + bboxes of shape [bs, n_anchors, 4], assigned scores of shape + [bs, n_anchors, n_classes] and output mask of shape [bs, + n_anchors] """ self.bs = pred_scores.size(0) self.n_max_boxes = gt_bboxes.size(1) @@ -76,7 +77,9 @@ def forward( if self.n_max_boxes == 0: device = gt_bboxes.device return ( - torch.full_like(pred_scores[..., 0], self.n_classes).to(device), + torch.full_like(pred_scores[..., 0], self.n_classes).to( + device + ), torch.zeros_like(pred_bboxes).to(device), torch.zeros_like(pred_scores).to(device), torch.zeros_like(pred_scores[..., 0]).to(device), @@ -105,7 +108,11 @@ def forward( ) # Generate final targets based on masks - assigned_labels, assigned_bboxes, assigned_scores = self._get_final_assignments( + ( + assigned_labels, + assigned_bboxes, + assigned_scores, + ) = self._get_final_assignments( gt_labels, gt_bboxes, assigned_gt_idx, mask_pos_sum ) @@ -137,7 +144,8 @@ def _get_alignment_metric( gt_labels: Tensor, gt_bboxes: Tensor, ): - """Calculates anchor alignment metric and IoU between GTs and predicted bboxes. + """Calculates anchor alignment metric and IoU between GTs and + predicted bboxes. @type pred_scores: Tensor @param pred_scores: Predicted scores [bs, n_anchors, 1] @@ -151,7 +159,9 @@ def _get_alignment_metric( pred_scores = pred_scores.permute(0, 2, 1) gt_labels = gt_labels.to(torch.long) ind = torch.zeros([2, self.bs, self.n_max_boxes], dtype=torch.long) - ind[0] = torch.arange(end=self.bs).view(-1, 1).repeat(1, self.n_max_boxes) + ind[0] = ( + torch.arange(end=self.bs).view(-1, 1).repeat(1, self.n_max_boxes) + ) ind[1] = gt_labels.squeeze(-1) bbox_scores = pred_scores[ind[0], ind[1]] @@ -169,24 +179,30 @@ def _select_topk_candidates( """Selects k anchors based on provided metrics tensor. 
 
         @type metrics: Tensor
-        @param metrics: Metrics tensor of shape [bs, n_max_boxes, n_anchors]
+        @param metrics: Metrics tensor of shape [bs, n_max_boxes,
+            n_anchors]
         @type largest: bool
-        @param largest: Flag if should keep largest topK. Defaults to True.
+        @param largest: Flag if should keep largest topK. Defaults to
+            True.
         @type topk_mask: Tensor
-        @param topk_mask: Mask for valid GTs of shape [bs, n_max_boxes, topk]
+        @param topk_mask: Mask for valid GTs of shape [bs, n_max_boxes,
+            topk]
         @rtype: Tensor
-        @return: Mask of selected anchors of shape [bs, n_max_boxes, n_anchors]
+        @return: Mask of selected anchors of shape [bs, n_max_boxes,
+            n_anchors]
         """
-        num_anchors = metrics.shape[-1]
+        n_anchors = metrics.shape[-1]
         topk_metrics, topk_idxs = torch.topk(
             metrics, self.topk, dim=-1, largest=largest
         )
         if topk_mask is None:
-            topk_mask = (topk_metrics.max(dim=-1, keepdim=True)[0] > self.eps).tile(
-                [1, 1, self.topk]
-            )
-        topk_idxs = torch.where(topk_mask, topk_idxs, torch.zeros_like(topk_idxs))
-        is_in_topk = F.one_hot(topk_idxs, num_anchors).sum(dim=-2)
+            topk_mask = (
+                topk_metrics.max(dim=-1, keepdim=True)[0] > self.eps
+            ).tile([1, 1, self.topk])
+        topk_idxs = torch.where(
+            topk_mask, topk_idxs, torch.zeros_like(topk_idxs)
+        )
+        is_in_topk = F.one_hot(topk_idxs, n_anchors).sum(dim=-2)
         is_in_topk = torch.where(
             is_in_topk > 1, torch.zeros_like(is_in_topk), is_in_topk
         )
@@ -210,8 +226,9 @@ def _get_final_assignments(
         @type mask_pos_sum: Tensor
         @param mask_pos_sum: Mask of matched GTs [bs, n_anchors]
         @rtype: tuple[Tensor, Tensor, Tensor]
-        @return: Assigned labels of shape [bs, n_anchors], assigned bboxes of shape [bs,
-            n_anchors, 4], assigned scores of shape [bs, n_anchors, n_classes].
+        @return: Assigned labels of shape [bs, n_anchors], assigned
+            bboxes of shape [bs, n_anchors, 4], assigned scores of shape
+            [bs, n_anchors, n_classes].
         """
         # assigned target labels
         batch_ind = torch.arange(
@@ -228,7 +245,9 @@ def _get_final_assignments(
         assigned_scores = F.one_hot(assigned_labels, self.n_classes)
         mask_pos_scores = mask_pos_sum[:, :, None].repeat(1, 1, self.n_classes)
         assigned_scores = torch.where(
-            mask_pos_scores > 0, assigned_scores, torch.full_like(assigned_scores, 0)
+            mask_pos_scores > 0,
+            assigned_scores,
+            torch.full_like(assigned_scores, 0),
         )
 
         assigned_labels = torch.where(
diff --git a/luxonis_train/utils/assigners/utils.py b/luxonis_train/assigners/utils.py
similarity index 88%
rename from luxonis_train/utils/assigners/utils.py
rename to luxonis_train/assigners/utils.py
index fadf5f8e..fe9fba4b 100644
--- a/luxonis_train/utils/assigners/utils.py
+++ b/luxonis_train/assigners/utils.py
@@ -2,7 +2,7 @@
 import torch.nn.functional as F
 from torch import Tensor
 
-from luxonis_train.utils.boxutils import bbox_iou
+from luxonis_train.utils import bbox_iou
 
 
 def candidates_in_gt(
@@ -20,7 +20,9 @@ def candidates_in_gt(
     @return: Mask for anchors inside any GT bbox
     """
     n_anchors = anchor_centers.size(0)
-    anchor_centers = anchor_centers.unsqueeze(0).repeat(gt_bboxes.size(0), 1, 1)
+    anchor_centers = anchor_centers.unsqueeze(0).repeat(
+        gt_bboxes.size(0), 1, 1
+    )
     gt_bboxes_lt = gt_bboxes[:, :2].unsqueeze(1).repeat(1, n_anchors, 1)
     gt_bboxes_rb = gt_bboxes[:, 2:].unsqueeze(1).repeat(1, n_anchors, 1)
     bbox_delta_lt = anchor_centers - gt_bboxes_lt
@@ -33,12 +35,15 @@ def candidates_in_gt(
 def fix_collisions(
     mask_pos: Tensor, overlaps: Tensor, n_max_boxes: int
 ) -> tuple[Tensor, Tensor, Tensor]:
-    """If an anchor is assigned to multiple GTs, the one with highest IoU is selected.
+ """If an anchor is assigned to multiple GTs, the one with highest + IoU is selected. @type mask_pos: Tensor - @param mask_pos: Mask of assigned anchors [bs, n_max_boxes, n_anchors] + @param mask_pos: Mask of assigned anchors [bs, n_max_boxes, + n_anchors] @type overlaps: Tensor - @param overlaps: IoUs between GTs and anchors [bx, n_max_boxes, n_anchors] + @param overlaps: IoUs between GTs and anchors [bx, n_max_boxes, + n_anchors] @type n_max_boxes: int @param n_max_boxes: Number of maximum boxes per image @rtype: tuple[Tensor, Tensor, Tensor] @@ -46,7 +51,9 @@ def fix_collisions( """ mask_pos_sum = mask_pos.sum(dim=-2) if mask_pos_sum.max() > 1: - mask_multi_gts = (mask_pos_sum.unsqueeze(1) > 1).repeat([1, n_max_boxes, 1]) + mask_multi_gts = (mask_pos_sum.unsqueeze(1) > 1).repeat( + [1, n_max_boxes, 1] + ) max_overlaps_idx = overlaps.argmax(dim=1) is_max_overlaps = F.one_hot(max_overlaps_idx, n_max_boxes) is_max_overlaps = is_max_overlaps.permute(0, 2, 1).to(overlaps.dtype) @@ -57,8 +64,8 @@ def fix_collisions( def batch_iou(batch1: Tensor, batch2: Tensor) -> Tensor: - """Calculates IoU for each pair of bboxes in the batch. Bboxes must be in xyxy - format. + """Calculates IoU for each pair of bboxes in the batch. Bboxes must + be in xyxy format. @type batch1: Tensor @param batch1: Tensor of shape C{[bs, N, 4]} diff --git a/luxonis_train/attached_modules/base_attached_module.py b/luxonis_train/attached_modules/base_attached_module.py index 17a4c277..904120a2 100644 --- a/luxonis_train/attached_modules/base_attached_module.py +++ b/luxonis_train/attached_modules/base_attached_module.py @@ -1,13 +1,15 @@ import logging from abc import ABC +from contextlib import suppress from typing import Generic +from luxonis_ml.data import LabelType from luxonis_ml.utils.registry import AutoRegisterMeta -from torch import Tensor, nn +from torch import Size, Tensor, nn from typing_extensions import TypeVarTuple, Unpack from luxonis_train.nodes import BaseNode -from luxonis_train.utils.types import IncompatibleException, Labels, LabelType, Packet +from luxonis_train.utils import IncompatibleException, Labels, Packet logger = logging.getLogger(__name__) @@ -15,7 +17,11 @@ class BaseAttachedModule( - nn.Module, Generic[Unpack[Ts]], ABC, metaclass=AutoRegisterMeta, register=False + nn.Module, + Generic[Unpack[Ts]], + ABC, + metaclass=AutoRegisterMeta, + register=False, ): """Base class for all modules that are attached to a L{LuxonisNode}. @@ -58,21 +64,38 @@ def __init__(self, *, node: BaseNode | None = None): self._node = node self._epoch = 0 - self._required_labels: tuple[LabelType, ...] | None = None - if self._node and self.supported_labels and self.node.tasks: + self.required_labels: list[LabelType] = [] + if self._node and self.supported_labels: + module_supported = [ + label.value + if isinstance(label, LabelType) + else f"({' + '.join(label)})" + for label in self.supported_labels + ] + module_supported = f"[{', '.join(module_supported)}]" + if not self.node.tasks: + raise IncompatibleException( + f"Module '{self.name}' requires one of the following " + f"labels or combinations of labels: {module_supported}, " + f"but is connected to node '{self.node.name}' which does not specify any tasks." 
+                )
             node_tasks = set(self.node.tasks)
             for required_labels in self.supported_labels:
                 if isinstance(required_labels, LabelType):
-                    required_labels = (required_labels,)
+                    required_labels = [required_labels]
+                else:
+                    required_labels = list(required_labels)
                 if set(required_labels) <= node_tasks:
-                    self._required_labels = required_labels
+                    self.required_labels = required_labels
                     break
             else:
-                raise ValueError(
-                    f"Module {self.name} supports labels {self.supported_labels}, "
-                    f"but is connected to node {self.node.name} which does not support any of them. "
-                    f"{self.node.name} supports {list(self.node_tasks.keys())}."
+                node_supported = [task.value for task in self.node.tasks]
+                raise IncompatibleException(
+                    f"Module '{self.name}' requires one of the following labels or combinations of labels: {module_supported}, "
+                    f"but is connected to node '{self.node.name}' which does not support any of them. "
+                    f"{self.node.name} supports {node_supported}."
                 )
+            self._check_node_type_override()
 
     @property
     def name(self) -> str:
@@ -83,7 +106,8 @@ def node(self) -> BaseNode:
         """Reference to the node that this module is attached to.
 
         @type: L{BaseNode}
-        @raises RuntimeError: If the node was not provided during initialization.
+        @raises RuntimeError: If the node was not provided during
+            initialization.
         """
         if self._node is None:
             raise RuntimeError(
@@ -93,20 +117,63 @@ def node(self) -> BaseNode:
         return self._node
 
     @property
-    def required_labels(self) -> tuple[LabelType, ...]:
-        if self._required_labels is None:
-            raise ValueError(f"{self.name} does not require any labels.")
-        return self._required_labels
+    def n_keypoints(self) -> int:
+        """Getter for the number of keypoints.
+
+        @type: int
+        @raises ValueError: If the node does not support keypoints.
+        @raises RuntimeError: If the node doesn't define any task.
+        """
+        return self.node.n_keypoints
+
+    @property
+    def n_classes(self) -> int:
+        """Getter for the number of classes.
+
+        @type: int
+        @raises RuntimeError: If the node doesn't define any task.
+        @raises ValueError: If the number of classes is different for
+            different tasks. In that case, use the L{get_n_classes}
+            method.
+        """
+        return self.node.n_classes
+
+    @property
+    def original_in_shape(self) -> Size:
+        """Getter for the original input shape as [N, H, W].
+
+        @type: Size
+        """
+        return self.node.original_in_shape
+
+    @property
+    def class_names(self) -> list[str]:
+        """Getter for the class names.
+
+        @type: list[str]
+        @raises RuntimeError: If the node doesn't define any task.
+        @raises ValueError: If the class names are different for
+            different tasks. In that case, use the L{get_class_names}
+            method.
+        """
+        return self.node.class_names
 
     @property
     def node_tasks(self) -> dict[LabelType, str]:
+        """Getter for the tasks of the attached node.
+
+        @type: dict[LabelType, str]
+        @raises RuntimeError: If the node does not have the `tasks` attribute set.
+        """
         if self.node._tasks is None:
-            raise ValueError("Node must have the `tasks` attribute specified.")
+            raise RuntimeError(
+                "Node must have the `tasks` attribute specified."
+            )
         return self.node._tasks
 
     def get_label(
         self, labels: Labels, label_type: LabelType | None = None
-    ) -> tuple[Tensor, LabelType]:
+    ) -> Tensor:
         """Extracts a specific label from the labels dictionary.
If the label type is not provided, the first label that matches the @@ -114,11 +181,11 @@ def get_label( Example:: >>> # supported_labels = [LabelType.SEGMENTATION] - >>> labels = {"segmentation": ..., "boundingbox": ...} + >>> labels = {"segmentation": seg_tensor, "boundingbox": bbox_tensor} >>> get_label(labels) - (..., LabelType.SEGMENTATION) # returns the first matching label + seg_tensor # returns the first matching label >>> get_label(labels, LabelType.BOUNDINGBOX) - (..., LabelType.BOUNDINGBOX) # returns the bounding box label + bbox_tensor # returns the bounding box label >>> get_label(labels, LabelType.CLASSIFICATION) IncompatibleException: Label 'classification' is missing from the dataset. @@ -126,13 +193,18 @@ def get_label( @param labels: Labels from the dataset. @type label_type: LabelType | None @param label_type: Type of the label to extract. - @raises IncompatibleException: If the label is not found in the labels dictionary. - @raises NotImplementedError: If the module requires multiple labels. For such cases, - the `prepare` method should be overridden. - @rtype: tuple[Tensor, LabelType] - @return: Extracted label and its type. + @rtype: Tensor + @return: Extracted label + + @raises ValueError: If the module requires multiple labels and the C{label_type} is not provided. + @raises IncompatibleException: If the label is not found in the labels dictionary. """ + return self._get_label(labels, label_type)[0] + + def _get_label( + self, labels: Labels, label_type: LabelType | None = None + ) -> tuple[Tensor, LabelType]: if label_type is None: if len(self.required_labels) == 1: label_type = self.required_labels[0] @@ -145,16 +217,9 @@ def get_label( ) return labels[task_name] - if len(self.required_labels) > 1: - raise NotImplementedError( - f"{self.name} requires multiple labels. You must provide the " - "`label_type` argument to extract the desired label." - ) - for label, label_type in labels.values(): - if label_type == self.required_labels[0]: - return label, label_type - raise IncompatibleException.from_missing_task( - self.required_labels[0].value, list(labels.keys()), self.name + raise ValueError( + f"{self.name} requires multiple labels. You must provide the " + "`label_type` argument to extract the desired label." ) def get_input_tensors( @@ -181,33 +246,37 @@ def get_input_tensors( @rtype: list[Tensor] @return: Extracted input tensors - @raises ValueError: If the task type is not supported by the node or if the task - is not present in the inputs. + @raises IncompatibleException: If the task type is not supported by the node. + @raises IncompatibleException: If the task is not present in the inputs. - @raises NotImplementedError: If the module requires multiple labels. + @raises ValueError: If the module requires multiple labels. For such cases, the `prepare` method should be overridden. """ if task_type is not None: if isinstance(task_type, LabelType): if task_type not in self.node_tasks: - raise ValueError( + raise IncompatibleException( f"Task {task_type.value} is not supported by the node " f"{self.node.name}." ) return inputs[self.node_tasks[task_type]] else: if task_type not in inputs: - raise ValueError(f"Task {task_type} is not present in the inputs.") + raise IncompatibleException( + f"Task {task_type} is not present in the inputs." + ) return inputs[task_type] if len(self.required_labels) > 1: - raise NotImplementedError( + raise ValueError( f"{self.name} requires multiple labels, " "you must provide the `task_type` argument to extract the desired input." 
) return inputs[self.node_tasks[self.required_labels[0]]] - def prepare(self, inputs: Packet[Tensor], labels: Labels) -> tuple[Unpack[Ts]]: + def prepare( + self, inputs: Packet[Tensor], labels: Labels + ) -> tuple[Unpack[Ts]]: """Prepares node outputs for the forward pass of the module. This default implementation selects the output and label based on @@ -223,48 +292,63 @@ def prepare(self, inputs: Packet[Tensor], labels: Labels) -> tuple[Unpack[Ts]]: @rtype: tuple[Unpack[Ts]] @return: Prepared inputs. Should allow the following usage with the - L{forward} method: + L{forward} method:: >>> loss.forward(*loss.prepare(outputs, labels)) - @raises NotImplementedError: If the module requires multiple labels. - @raises IncompatibleException: If the inputs are not compatible with the module. + @raises RuntimeError: If the module requires multiple labels and + is connected to a multi-task node. In this case, the default + implementation cannot be used and the C{prepare} method should be overridden. + + @raises RuntimeError: If the C{tasks} attribute is not set on the node. + @raises RuntimeError: If the C{supported_labels} attribute is not set on the module. """ if self.node._tasks is None: - raise ValueError( + raise RuntimeError( f"{self.node.name} must have the `tasks` attribute specified " f"for {self.name} to make use of the default `prepare` method." ) if self.supported_labels is None: - raise ValueError( + raise RuntimeError( f"{self.name} must have the `supported_labels` attribute " "specified in order to use the default `prepare` method." ) if len(self.supported_labels) > 1: - if len(self.node._tasks) > 1: - raise NotImplementedError( + if len(self.node_tasks) > 1: + raise RuntimeError( f"{self.name} supports more than one label type" f"and is connected to {self.node.name} node " "which is a multi-task node. The default `prepare` " "implementation cannot be used in this case." ) self.supported_labels = list( - set(self.supported_labels) & set(self.node._tasks) + set(self.supported_labels) & set(self.node_tasks) ) x = self.get_input_tensors(inputs) - label, label_type = self.get_label(labels) + label, label_type = self._get_label(labels) if label_type in [LabelType.CLASSIFICATION, LabelType.SEGMENTATION]: - if isinstance(x, list): - if len(x) == 1: - x = x[0] - else: - logger.warning( - f"Module {self.name} expects a single tensor as input, " - f"but got {len(x)} tensors. Using the last tensor. " - f"If this is not the desired behavior, please override the " - "`prepare` method of the attached module or the `wrap` " - f"method of {self.node.name}." - ) - x = x[-1] + if len(x) == 1: + x = x[0] + else: + logger.warning( + f"Module {self.name} expects a single tensor as input, " + f"but got {len(x)} tensors. Using the last tensor. " + f"If this is not the desired behavior, please override the " + "`prepare` method of the attached module or the `wrap` " + f"method of {self.node.name}." + ) + x = x[-1] return x, label # type: ignore + + def _check_node_type_override(self) -> None: + if "node" not in self.__annotations__: + return + + node_type = self.__annotations__["node"] + with suppress(RuntimeError): + if not isinstance(self.node, node_type): + raise IncompatibleException( + f"Module '{self.name}' is attached to the '{self.node.name}' node, " + f"but '{self.name}' is only compatible with nodes of type '{node_type.__name__}'." 
+ ) diff --git a/luxonis_train/attached_modules/losses/adaptive_detection_loss.py b/luxonis_train/attached_modules/losses/adaptive_detection_loss.py index 6a28bff9..d25825cb 100644 --- a/luxonis_train/attached_modules/losses/adaptive_detection_loss.py +++ b/luxonis_train/attached_modules/losses/adaptive_detection_loss.py @@ -1,31 +1,39 @@ -from typing import Literal, cast +import logging +from typing import Any, Literal, cast import torch import torch.nn.functional as F +from luxonis_ml.data import LabelType from torch import Tensor, nn from torchvision.ops import box_convert +from luxonis_train.assigners import ATSSAssigner, TaskAlignedAssigner from luxonis_train.nodes import EfficientBBoxHead -from luxonis_train.utils.assigners import ATSSAssigner, TaskAlignedAssigner -from luxonis_train.utils.boxutils import ( - IoUType, +from luxonis_train.utils import ( + Labels, + Packet, anchors_for_fpn_features, compute_iou_loss, dist2bbox, ) -from luxonis_train.utils.types import IncompatibleException, Labels, LabelType, Packet +from luxonis_train.utils.boundingbox import IoUType from .base_loss import BaseLoss +logger = logging.getLogger(__name__) -class AdaptiveDetectionLoss(BaseLoss[Tensor, Tensor, Tensor, Tensor, Tensor, Tensor]): + +class AdaptiveDetectionLoss( + BaseLoss[Tensor, Tensor, Tensor, Tensor, Tensor, Tensor] +): node: EfficientBBoxHead supported_labels = [LabelType.BOUNDINGBOX] - class NodePacket(Packet[Tensor]): - features: list[Tensor] - class_scores: Tensor - distributions: Tensor + anchors: Tensor + anchor_points: Tensor + n_anchors_list: list[int] + stride_tensor: Tensor + gt_bboxes_scale: Tensor def __init__( self, @@ -34,7 +42,7 @@ def __init__( reduction: Literal["sum", "mean"] = "mean", class_loss_weight: float = 1.0, iou_loss_weight: float = 2.5, - **kwargs, + **kwargs: Any, ): """BBox loss adapted from U{YOLOv6: A Single-Stage Object Detection Framework for Industrial Applications }. It combines IoU based bbox regression loss and varifocal loss @@ -51,23 +59,15 @@ def __init__( @param class_loss_weight: Weight of classification loss. @type iou_loss_weight: float @param iou_loss_weight: Weight of IoU loss. - @type kwargs: dict - @param kwargs: Additional arguments to pass to L{BaseLoss}. """ super().__init__(**kwargs) - if not isinstance(self.node, EfficientBBoxHead): - raise IncompatibleException( - f"Loss `{self.name}` is only " - "compatible with nodes of type `EfficientBBoxHead`." 
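# Aside: the hand-written isinstance check deleted above is superseded by
# the annotation-based `_check_node_type_override`. A framework-free toy
# (hypothetical names) of that mechanism: the compatible node type is read
# from the class annotation and validated against the actual node.
class Head: ...
class EfficientHead(Head): ...

class Loss:
    node: EfficientHead  # declares the only compatible node type

    def __init__(self, node: Head):
        if not isinstance(node, self.__annotations__["node"]):
            raise TypeError(f"{type(self).__name__} requires an EfficientHead")
        self.node = node

Loss(EfficientHead())  # ok; Loss(Head()) would raise TypeError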
- ) self.iou_type: IoUType = iou_type self.reduction = reduction - self.n_classes = self.node.n_classes self.stride = self.node.stride self.grid_cell_size = self.node.grid_cell_size self.grid_cell_offset = self.node.grid_cell_offset - self.original_img_size = self.node.original_in_shape[1:] + self.original_img_size = self.original_in_shape[1:] self.n_warmup_epochs = n_warmup_epochs self.atts_assigner = ATSSAssigner(topk=9, n_classes=self.n_classes) @@ -79,84 +79,41 @@ def __init__( self.class_loss_weight = class_loss_weight self.iou_loss_weight = iou_loss_weight - self.anchors = None - self.anchor_points = None - self.n_anchors_list = None - self.stride_tensor = None - self.gt_bboxes_scale = None + self._logged_assigner_change = False def prepare( - self, outputs: Packet[Tensor], labels: Labels + self, inputs: Packet[Tensor], labels: Labels ) -> tuple[Tensor, Tensor, Tensor, Tensor, Tensor, Tensor]: - feats = self.get_input_tensors(outputs, "features") - pred_scores = self.get_input_tensors(outputs, "class_scores")[0] - pred_distri = self.get_input_tensors(outputs, "distributions")[0] + feats = self.get_input_tensors(inputs, "features") + pred_scores = self.get_input_tensors(inputs, "class_scores")[0] + pred_distri = self.get_input_tensors(inputs, "distributions")[0] + + target = self.get_label(labels) + batch_size = pred_scores.shape[0] - device = pred_scores.device - target = self.get_label(labels)[0] - if self.gt_bboxes_scale is None: - self.gt_bboxes_scale = torch.tensor( - [ - self.original_img_size[1], - self.original_img_size[0], - self.original_img_size[1], - self.original_img_size[0], - ], - device=device, - ) - ( - self.anchors, - self.anchor_points, - self.n_anchors_list, - self.stride_tensor, - ) = anchors_for_fpn_features( - feats, - self.stride, - self.grid_cell_size, - self.grid_cell_offset, - multiply_with_stride=True, - ) - self.anchor_points_strided = self.anchor_points / self.stride_tensor + self._init_parameters(feats) - target = self._preprocess_target(target, batch_size) + target = self._preprocess_bbox_target(target, batch_size) pred_bboxes = dist2bbox(pred_distri, self.anchor_points_strided) gt_labels = target[:, :, :1] gt_xyxy = target[:, :, 1:] mask_gt = (gt_xyxy.sum(-1, keepdim=True) > 0).float() - if self._epoch < self.n_warmup_epochs: - ( - assigned_labels, - assigned_bboxes, - assigned_scores, - mask_positive, - _, - ) = self.atts_assigner( - self.anchors, - self.n_anchors_list, - gt_labels, - gt_xyxy, - mask_gt, - pred_bboxes.detach() * self.stride_tensor, - ) - else: - # TODO: log change of assigner (once common Logger) - ( - assigned_labels, - assigned_bboxes, - assigned_scores, - mask_positive, - _, - ) = self.tal_assigner( - pred_scores.detach(), - pred_bboxes.detach() * self.stride_tensor, - self.anchor_points, - gt_labels, - gt_xyxy, - mask_gt, - ) + ( + assigned_labels, + assigned_bboxes, + assigned_scores, + mask_positive, + _, + ) = self._run_assigner( + gt_labels, + gt_xyxy, + mask_gt, + pred_bboxes, + pred_scores, + ) return ( pred_bboxes, @@ -176,8 +133,12 @@ def forward( assigned_scores: Tensor, mask_positive: Tensor, ): - one_hot_label = F.one_hot(assigned_labels.long(), self.n_classes + 1)[..., :-1] - loss_cls = self.varifocal_loss(pred_scores, assigned_scores, one_hot_label) + one_hot_label = F.one_hot(assigned_labels.long(), self.n_classes + 1)[ + ..., :-1 + ] + loss_cls = self.varifocal_loss( + pred_scores, assigned_scores, one_hot_label + ) if assigned_scores.sum() > 1: loss_cls /= assigned_scores.sum() @@ -192,17 +153,77 @@ def forward( 
bbox_format="xyxy", )[0] - loss = self.class_loss_weight * loss_cls + self.iou_loss_weight * loss_iou + loss = ( + self.class_loss_weight * loss_cls + self.iou_loss_weight * loss_iou + ) sub_losses = {"class": loss_cls.detach(), "iou": loss_iou.detach()} return loss, sub_losses - def _preprocess_target(self, target: Tensor, batch_size: int): - """Preprocess target in shape [batch_size, N, 5] where N is maximum number of - instances in one image.""" + def _init_parameters(self, features: list[Tensor]): + if not hasattr(self, "gt_bboxes_scale"): + self.gt_bboxes_scale = torch.tensor( + [ + self.original_img_size[1], + self.original_img_size[0], + self.original_img_size[1], + self.original_img_size[0], + ], + device=features[0].device, + ) + ( + self.anchors, + self.anchor_points, + self.n_anchors_list, + self.stride_tensor, + ) = anchors_for_fpn_features( + features, + self.stride, + self.grid_cell_size, + self.grid_cell_offset, + multiply_with_stride=True, + ) + self.anchor_points_strided = ( + self.anchor_points / self.stride_tensor + ) + + def _run_assigner( + self, + gt_labels: Tensor, + gt_xyxy: Tensor, + mask_gt: Tensor, + pred_bboxes: Tensor, + pred_scores: Tensor, + ) -> tuple[Tensor, Tensor, Tensor, Tensor, Tensor]: + if self._epoch < self.n_warmup_epochs: + return self.atts_assigner( + self.anchors, + self.n_anchors_list, + gt_labels, + gt_xyxy, + mask_gt, + pred_bboxes.detach() * self.stride_tensor, + ) + else: + self._log_assigner_change() + return self.tal_assigner( + pred_scores.detach(), + pred_bboxes.detach() * self.stride_tensor, + self.anchor_points, + gt_labels, + gt_xyxy, + mask_gt, + ) + + def _preprocess_bbox_target( + self, target: Tensor, batch_size: int + ) -> Tensor: + """Preprocess target in shape [batch_size, N, 5] where N is the + maximum number of instances in one image.""" sample_ids, counts = cast( - tuple[Tensor, Tensor], torch.unique(target[:, 0].int(), return_counts=True) + tuple[Tensor, Tensor], + torch.unique(target[:, 0].int(), return_counts=True), ) c_max = int(counts.max()) if counts.numel() > 0 else 0 out_target = torch.zeros(batch_size, c_max, 5, device=target.device) @@ -214,6 +235,16 @@ def _preprocess_target(self, target: Tensor, batch_size: int): out_target[..., 1:] = box_convert(scaled_target, "xywh", "xyxy") return out_target + def _log_assigner_change(self): + if self._logged_assigner_change: + return + + logger.info( + f"Switching to Task Aligned Assigner after {self.n_warmup_epochs} warmup epochs.", + stacklevel=2, + ) + self._logged_assigner_change = True + class VarifocalLoss(nn.Module): def __init__(self, alpha: float = 0.75, gamma: float = 2.0): @@ -236,7 +267,8 @@ def forward( self, pred_score: Tensor, target_score: Tensor, label: Tensor ) -> Tensor: weight = ( - self.alpha * pred_score.pow(self.gamma) * (1 - label) + target_score * label + self.alpha * pred_score.pow(self.gamma) * (1 - label) + + target_score * label ) ce_loss = F.binary_cross_entropy( pred_score.float(), target_score.float(), reduction="none" diff --git a/luxonis_train/attached_modules/losses/base_loss.py b/luxonis_train/attached_modules/losses/base_loss.py index 89ce8d8c..7a69d0d8 100644 --- a/luxonis_train/attached_modules/losses/base_loss.py +++ b/luxonis_train/attached_modules/losses/base_loss.py @@ -17,19 +17,23 @@ class BaseLoss( ): """A base class for all loss functions. - This class defines the basic interface for all loss functions. It utilizes automatic - registration of defined subclasses to a L{LOSSES} registry. 
+    This class defines the basic interface for all loss functions. It
+    utilizes automatic registration of defined subclasses to a L{LOSSES}
+    registry.
     """
 
     @abstractmethod
-    def forward(self, *args: Unpack[Ts]) -> Tensor | tuple[Tensor, dict[str, Tensor]]:
+    def forward(
+        self, *args: Unpack[Ts]
+    ) -> Tensor | tuple[Tensor, dict[str, Tensor]]:
         """Forward pass of the loss function.
 
         @type args: Unpack[Ts]
         @param args: Prepared inputs from the L{prepare} method.
         @rtype: Tensor | tuple[Tensor, dict[str, Tensor]]
-        @return: The main loss and optional a dictionary of sublosses (for logging).
-            Only the main loss is used for backpropagation.
+        @return: The main loss and optionally a dictionary of sublosses
+            (for logging). Only the main loss is used for
+            backpropagation.
         """
         ...
 
@@ -45,8 +49,10 @@ def run(
         @type labels: L{Labels}
         @param labels: Labels from the dataset.
         @rtype: Tensor | tuple[Tensor, dict[str, Tensor]]
-        @return: The main loss and optional a dictionary of sublosses (for logging).
-            Only the main loss is used for backpropagation.
-        @raises IncompatibleException: If the inputs are not compatible with the module.
+        @return: The main loss and optionally a dictionary of sublosses
+            (for logging). Only the main loss is used for
+            backpropagation.
+        @raises IncompatibleException: If the inputs are not compatible
+            with the module.
         """
         return self(*self.prepare(inputs, labels))
diff --git a/luxonis_train/attached_modules/losses/bce_with_logits.py b/luxonis_train/attached_modules/losses/bce_with_logits.py
index 442a89c3..b759d06b 100644
--- a/luxonis_train/attached_modules/losses/bce_with_logits.py
+++ b/luxonis_train/attached_modules/losses/bce_with_logits.py
@@ -1,4 +1,4 @@
-from typing import Literal
+from typing import Any, Literal
 
 import torch
 from luxonis_ml.data import LabelType
@@ -15,35 +15,39 @@ def __init__(
         weight: list[float] | None = None,
         reduction: Literal["none", "mean", "sum"] = "mean",
         pos_weight: Tensor | None = None,
-        **kwargs,
+        **kwargs: Any,
     ):
-        """This loss combines a L{nn.Sigmoid} layer and the L{nn.BCELoss} in one single
-        class. This version is more numerically stable than using a plain C{Sigmoid}
-        followed by a {BCELoss} as, by combining the operations into one layer, we take
-        advantage of the log-sum-exp trick for numerical stability.
+        """This loss combines a L{nn.Sigmoid} layer and the
+        L{nn.BCELoss} in one single class. This version is more
+        numerically stable than using a plain C{Sigmoid} followed by a
+        C{BCELoss} as, by combining the operations into one layer, we
+        take advantage of the log-sum-exp trick for numerical stability.
 
         @type weight: list[float] | None
-        @param weight: a manual rescaling weight given to the loss of each batch
-            element. If given, has to be a list of length C{nbatch}. Defaults to
-            C{None}.
+        @param weight: a manual rescaling weight given to the loss of
+            each batch element. If given, has to be a list of length
+            C{nbatch}. Defaults to C{None}.
         @type reduction: Literal["none", "mean", "sum"]
-        @param reduction: Specifies the reduction to apply to the output: C{"none"} |
-            C{"mean"} | C{"sum"}. C{"none"}: no reduction will be applied, C{"mean"}:
-            the sum of the output will be divided by the number of elements in the
-            output, C{"sum"}: the output will be summed. Note: C{size_average} and
-            C{reduce} are in the process of being deprecated, and in the meantime,
-            specifying either of those two args will override C{reduction}. Defaults to
-            C{"mean"}.
+        @param reduction: Specifies the reduction to apply to the
+            output: C{"none"} | C{"mean"} | C{"sum"}. C{"none"}: no
+            reduction will be applied, C{"mean"}: the sum of the output
+            will be divided by the number of elements in the output,
+            C{"sum"}: the output will be summed. Note: C{size_average}
+            and C{reduce} are in the process of being deprecated, and in
+            the meantime, specifying either of those two args will
+            override C{reduction}. Defaults to C{"mean"}.
         @type pos_weight: Tensor | None
-        @param pos_weight: a weight of positive examples to be broadcasted with target.
-            Must be a tensor with equal size along the class dimension to the number of
-            classes. Pay close attention to PyTorch's broadcasting semantics in order to
-            achieve the desired operations. For a target of size [B, C, H, W] (where B
-            is batch size) pos_weight of size [B, C, H, W] will apply different
-            pos_weights to each element of the batch or [C, H, W] the same pos_weights
-            across the batch. To apply the same positive weight along all spacial
-            dimensions for a 2D multi-class target [C, H, W] use: [C, 1, 1]. Defaults to
-            C{None}.
+        @param pos_weight: a weight of positive examples to be
+            broadcasted with target. Must be a tensor with equal size
+            along the class dimension to the number of classes. Pay
+            close attention to PyTorch's broadcasting semantics in order
+            to achieve the desired operations. For a target of size [B,
+            C, H, W] (where B is batch size) pos_weight of size [B, C,
+            H, W] will apply different pos_weights to each element of
+            the batch or [C, H, W] the same pos_weights across the
+            batch. To apply the same positive weight along all spatial
+            dimensions for a 2D multi-class target [C, H, W] use: [C, 1,
+            1]. Defaults to C{None}.
         """
         super().__init__(**kwargs)
         self.criterion = nn.BCEWithLogitsLoss(
@@ -53,6 +57,15 @@ def __init__(
         )
 
     def forward(self, predictions: Tensor, target: Tensor) -> Tensor:
+        """Computes the BCE loss from logits.
+
+        @type predictions: Tensor
+        @param predictions: Network predictions of shape (N, C, ...)
+        @type target: Tensor
+        @param target: A tensor of the same shape as predictions.
+        @rtype: Tensor
+        @return: A scalar tensor.
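# Aside: an illustration (not from the patch) of the numerical-stability
# point in the docstring above. For large logits, float32 sigmoid
# saturates to exactly 1.0, so the separate sigmoid + BCE path loses the
# true loss value, while the fused version recovers it.
import torch
import torch.nn.functional as F

logits = torch.tensor([20.0])
target = torch.tensor([0.0])

separate = F.binary_cross_entropy(torch.sigmoid(logits), target)
fused = F.binary_cross_entropy_with_logits(logits, target)
print(separate.item(), fused.item())  # ~100.0 (clamped log) vs ~20.0 (exact)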
+ """ if predictions.shape != target.shape: raise RuntimeError( f"Target tensor dimension ({target.shape}) and preds tensor " diff --git a/luxonis_train/attached_modules/losses/cross_entropy.py b/luxonis_train/attached_modules/losses/cross_entropy.py index 05a0f524..4be0cfdc 100644 --- a/luxonis_train/attached_modules/losses/cross_entropy.py +++ b/luxonis_train/attached_modules/losses/cross_entropy.py @@ -1,5 +1,5 @@ from logging import getLogger -from typing import Literal +from typing import Any, Literal import torch import torch.nn as nn @@ -9,12 +9,11 @@ from .base_loss import BaseLoss logger = getLogger(__name__) -was_logged = False class CrossEntropyLoss(BaseLoss[Tensor, Tensor]): - """This criterion computes the cross entropy loss between input logits and - target.""" + """This criterion computes the cross entropy loss between input + logits and target.""" supported_labels = [LabelType.SEGMENTATION, LabelType.CLASSIFICATION] @@ -24,7 +23,7 @@ def __init__( ignore_index: int = -100, reduction: Literal["none", "mean", "sum"] = "mean", label_smoothing: float = 0.0, - **kwargs, + **kwargs: Any, ): super().__init__(**kwargs) @@ -34,19 +33,19 @@ def __init__( reduction=reduction, label_smoothing=label_smoothing, ) + self._was_logged = False def forward(self, preds: Tensor, target: Tensor) -> Tensor: - global was_logged if preds.ndim == target.ndim: ch_dim = 1 if preds.ndim > 1 else 0 if preds.shape[ch_dim] == 1: - if not was_logged: + if not self._was_logged: logger.warning( "`CrossEntropyLoss` expects at least 2 classes. " "Attempting to fix by adding a dummy channel. " "If you want to be sure, use `BCEWithLogitsLoss` instead." ) - was_logged = True + self._was_logged = True preds = torch.cat([torch.zeros_like(preds), preds], dim=ch_dim) if target.shape[ch_dim] == 1: target = torch.cat([1 - target, target], dim=ch_dim) diff --git a/luxonis_train/attached_modules/losses/efficient_keypoint_bbox_loss.py b/luxonis_train/attached_modules/losses/efficient_keypoint_bbox_loss.py index 2e6621de..d996dcfd 100644 --- a/luxonis_train/attached_modules/losses/efficient_keypoint_bbox_loss.py +++ b/luxonis_train/attached_modules/losses/efficient_keypoint_bbox_loss.py @@ -1,52 +1,44 @@ -from typing import Literal, cast +from typing import Any, Literal import torch import torch.nn.functional as F -from torch import Tensor, nn -from torchvision.ops import box_convert +from luxonis_ml.data import LabelType +from torch import Tensor -from luxonis_train.attached_modules.metrics.object_keypoint_similarity import ( - get_area_factor, - get_sigmas, -) +from luxonis_train.attached_modules.losses import AdaptiveDetectionLoss from luxonis_train.nodes import EfficientKeypointBBoxHead -from luxonis_train.utils.assigners import ATSSAssigner, TaskAlignedAssigner -from luxonis_train.utils.boxutils import ( - IoUType, - anchors_for_fpn_features, +from luxonis_train.utils import ( + Labels, + Packet, compute_iou_loss, dist2bbox, + get_sigmas, + get_with_default, ) -from luxonis_train.utils.types import IncompatibleException, Labels, LabelType, Packet +from luxonis_train.utils.boundingbox import IoUType -from .base_loss import BaseLoss from .bce_with_logits import BCEWithLogitsLoss -class EfficientKeypointBBoxLoss( - BaseLoss[Tensor, Tensor, Tensor, Tensor, Tensor, Tensor] -): +class EfficientKeypointBBoxLoss(AdaptiveDetectionLoss): node: EfficientKeypointBBoxHead supported_labels = [(LabelType.BOUNDINGBOX, LabelType.KEYPOINTS)] - class NodePacket(Packet[Tensor]): - features: list[Tensor] - class_scores: Tensor - 
distributions: Tensor + gt_kpts_scale: Tensor def __init__( self, n_warmup_epochs: int = 4, iou_type: IoUType = "giou", reduction: Literal["sum", "mean"] = "mean", - class_bbox_loss_weight: float = 1.0, + class_loss_weight: float = 1.0, iou_loss_weight: float = 2.5, viz_pw: float = 1.0, regr_kpts_loss_weight: float = 1.5, vis_kpts_loss_weight: float = 1.0, sigmas: list[float] | None = None, area_factor: float | None = None, - **kwargs, + **kwargs: Any, ): """BBox loss adapted from U{YOLOv6: A Single-Stage Object Detection Framework for Industrial Applications }. It combines IoU based bbox regression loss and varifocal loss @@ -55,12 +47,12 @@ def __init__( @type n_warmup_epochs: int @param n_warmup_epochs: Number of epochs where ATSS assigner is used, after that we switch to TAL assigner. - @type iou_type: L{IoUType} + @type iou_type: Literal["none", "giou", "diou", "ciou", "siou"] @param iou_type: IoU type used for bbox regression loss. @type reduction: Literal["sum", "mean"] @param reduction: Reduction type for loss. - @type class_bbox_loss_weight: float - @param class_bbox_loss_weight: Weight of classification loss for bounding boxes. + @type class_loss_weight: float + @param class_loss_weight: Weight of classification loss for bounding boxes. @type regr_kpts_loss_weight: float @param regr_kpts_loss_weight: Weight of regression loss for keypoints. @type vis_kpts_loss_weight: float @@ -71,153 +63,100 @@ def __init__( @param sigmas: Sigmas used in KeypointLoss for OKS metric. If None then use COCO ones if possible or default ones. Defaults to C{None}. @type area_factor: float | None @param area_factor: Factor by which we multiply bbox area which is used in KeypointLoss. If None then use default one. Defaults to C{None}. - @type kwargs: dict - @param kwargs: Additional arguments to pass to L{BaseLoss}. """ - super().__init__(**kwargs) + super().__init__( + n_warmup_epochs=n_warmup_epochs, + iou_type=iou_type, + reduction=reduction, + class_loss_weight=class_loss_weight, + iou_loss_weight=iou_loss_weight, + **kwargs, + ) - if not isinstance(self.node, EfficientKeypointBBoxHead): - raise IncompatibleException( - f"Loss `{self.name}` is only " - "compatible with nodes of type `EfficientKeypointBBoxHead`." 
- ) - self.iou_type: IoUType = iou_type - self.reduction = reduction - self.n_classes = self.node.n_classes - self.stride = self.node.stride - self.grid_cell_size = self.node.grid_cell_size - self.grid_cell_offset = self.node.grid_cell_offset - self.original_img_size = self.node.original_in_shape[1:] - self.n_heads = self.node.n_heads - self.n_kps = self.node.n_keypoints - - self.b_cross_entropy = BCEWithLogitsLoss(pos_weight=torch.tensor([viz_pw])) + self.b_cross_entropy = BCEWithLogitsLoss( + pos_weight=torch.tensor([viz_pw]) + ) self.sigmas = get_sigmas( - sigmas=sigmas, n_keypoints=self.n_kps, class_name=self.name + sigmas=sigmas, + n_keypoints=self.n_keypoints, + caller_name=self.name, ) - self.area_factor = get_area_factor(area_factor, class_name=self.name) - - self.n_warmup_epochs = n_warmup_epochs - self.atts_assigner = ATSSAssigner(topk=9, n_classes=self.n_classes) - self.tal_assigner = TaskAlignedAssigner( - topk=13, n_classes=self.n_classes, alpha=1.0, beta=6.0 + self.area_factor = get_with_default( + area_factor, "bbox area scaling", self.name, default=0.53 ) - - self.varifocal_loss = VarifocalLoss() - self.class_bbox_loss_weight = class_bbox_loss_weight - self.iou_loss_weight = iou_loss_weight self.regr_kpts_loss_weight = regr_kpts_loss_weight self.vis_kpts_loss_weight = vis_kpts_loss_weight def prepare( - self, outputs: Packet[Tensor], labels: Labels - ) -> tuple[Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor]: - feats = self.get_input_tensors(outputs, "features") - pred_scores = self.get_input_tensors(outputs, "class_scores")[0] - pred_distri = self.get_input_tensors(outputs, "distributions")[0] - pred_kpts = self.get_input_tensors(outputs, "keypoints_raw")[0] + self, inputs: Packet[Tensor], labels: Labels + ) -> tuple[ + Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor + ]: + feats = self.get_input_tensors(inputs, "features") + pred_scores = self.get_input_tensors(inputs, "class_scores")[0] + pred_distri = self.get_input_tensors(inputs, "distributions")[0] + pred_kpts = self.get_input_tensors(inputs, "keypoints_raw")[0] + + target_kpts = self.get_label(labels, LabelType.KEYPOINTS) + target_bbox = self.get_label(labels, LabelType.BOUNDINGBOX) batch_size = pred_scores.shape[0] - device = pred_scores.device - - target_kpts = self.get_label(labels, LabelType.KEYPOINTS)[0] - target_bbox = self.get_label(labels, LabelType.BOUNDINGBOX)[0] n_kpts = (target_kpts.shape[1] - 2) // 3 - gt_bboxes_scale = torch.tensor( - [ - self.original_img_size[1], - self.original_img_size[0], - self.original_img_size[1], - self.original_img_size[0], - ], - device=device, - ) - gt_kpts_scale = torch.tensor( - [ - self.original_img_size[1], - self.original_img_size[0], - ], - device=device, - ) - ( - anchors, - anchor_points, - n_anchors_list, - stride_tensor, - ) = anchors_for_fpn_features( - feats, - self.stride, - self.grid_cell_size, - self.grid_cell_offset, - multiply_with_stride=True, - ) + self._init_parameters(feats) - anchor_points_strided = anchor_points / stride_tensor - pred_bboxes = dist2bbox(pred_distri, anchor_points_strided) + pred_bboxes = dist2bbox(pred_distri, self.anchor_points_strided) pred_kpts = self.dist2kpts_noscale( - anchor_points_strided, pred_kpts.view(batch_size, -1, n_kpts, 3) + self.anchor_points_strided, + pred_kpts.view( + batch_size, + -1, + n_kpts, + 3, + ), ) - target_bbox = self._preprocess_bbox_target( - target_bbox, batch_size, gt_bboxes_scale - ) + target_bbox = self._preprocess_bbox_target(target_bbox, batch_size) 
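# Aside: a hedged toy run of the `_preprocess_bbox_target` padding scheme
# used above (inherited from AdaptiveDetectionLoss): flat rows of
# [img_id, cls, x, y, w, h] become a dense [bs, N_max, 5] tensor of
# class + xyxy boxes, padded with class -1; values below are made up.
import torch
from torchvision.ops import box_convert

target = torch.tensor([
    [0, 1, 0.1, 0.1, 0.2, 0.2],  # image 0: one box, normalized xywh
    [1, 0, 0.5, 0.5, 0.3, 0.3],  # image 1: two boxes
    [1, 2, 0.2, 0.6, 0.1, 0.1],
])
batch_size = 2
scale = torch.tensor([640.0, 640.0, 640.0, 640.0])  # [W, H, W, H]

ids, counts = torch.unique(target[:, 0].int(), return_counts=True)
out = torch.zeros(batch_size, int(counts.max()), 5)
out[:, :, 0] = -1
for i, count in zip(ids, counts):
    out[i, :count] = target[target[:, 0] == i][:, 1:]
out[..., 1:] = box_convert(out[:, :, 1:] * scale, "xywh", "xyxy")
print(out.shape)  # torch.Size([2, 2, 5]); padded rows keep all-zero boxes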
gt_bbox_labels = target_bbox[:, :, :1] gt_xyxy = target_bbox[:, :, 1:] mask_gt = (gt_xyxy.sum(-1, keepdim=True) > 0).float() - - if self._epoch < self.n_warmup_epochs: - ( - assigned_labels, - assigned_bboxes, - assigned_scores, - mask_positive, - assigned_gt_idx, - ) = self.atts_assigner( - anchors, - n_anchors_list, - gt_bbox_labels, - gt_xyxy, - mask_gt, - pred_bboxes.detach() * stride_tensor, - ) - else: - ( - assigned_labels, - assigned_bboxes, - assigned_scores, - mask_positive, - assigned_gt_idx, - ) = self.tal_assigner( - pred_scores.detach(), - pred_bboxes.detach() * stride_tensor, - anchor_points, - gt_bbox_labels, - gt_xyxy, - mask_gt, - ) + ( + assigned_labels, + assigned_bboxes, + assigned_scores, + mask_positive, + assigned_gt_idx, + ) = self._run_assigner( + gt_bbox_labels, + gt_xyxy, + mask_gt, + pred_bboxes, + pred_scores, + ) batched_kpts = self._preprocess_kpts_target( - target_kpts, batch_size, gt_kpts_scale + target_kpts, batch_size, self.gt_kpts_scale ) assigned_gt_idx_expanded = assigned_gt_idx.unsqueeze(-1).unsqueeze(-1) selected_keypoints = batched_kpts.gather( - 1, assigned_gt_idx_expanded.expand(-1, -1, self.n_kps, 3) + 1, assigned_gt_idx_expanded.expand(-1, -1, self.n_keypoints, 3) ) xy_components = selected_keypoints[:, :, :, :2] - normalized_xy = xy_components / stride_tensor.view(1, -1, 1, 1) + normalized_xy = xy_components / self.stride_tensor.view(1, -1, 1, 1) selected_keypoints = torch.cat( (normalized_xy, selected_keypoints[:, :, :, 2:]), dim=-1 ) gt_kpt = selected_keypoints[mask_positive] pred_kpts = pred_kpts[mask_positive] - assigned_bboxes = assigned_bboxes / stride_tensor + assigned_bboxes = assigned_bboxes / self.stride_tensor area = ( - assigned_bboxes[mask_positive][:, 0] - assigned_bboxes[mask_positive][:, 2] + assigned_bboxes[mask_positive][:, 0] + - assigned_bboxes[mask_positive][:, 2] ) * ( - assigned_bboxes[mask_positive][:, 1] - assigned_bboxes[mask_positive][:, 3] + assigned_bboxes[mask_positive][:, 1] + - assigned_bboxes[mask_positive][:, 3] ) return ( @@ -256,8 +195,12 @@ def forward( ).mean() visibility_loss = self.b_cross_entropy.forward(pred_kpts[..., 2], mask) - one_hot_label = F.one_hot(assigned_labels.long(), self.n_classes + 1)[..., :-1] - loss_cls = self.varifocal_loss(pred_scores, assigned_scores, one_hot_label) + one_hot_label = F.one_hot(assigned_labels.long(), self.n_classes + 1)[ + ..., :-1 + ] + loss_cls = self.varifocal_loss( + pred_scores, assigned_scores, one_hot_label + ) if assigned_scores.sum() > 1: loss_cls /= assigned_scores.sum() @@ -273,7 +216,7 @@ def forward( )[0] loss = ( - self.class_bbox_loss_weight * loss_cls + self.class_loss_weight * loss_cls + self.iou_loss_weight * loss_iou + regression_loss * self.regr_kpts_loss_weight + visibility_loss * self.vis_kpts_loss_weight @@ -288,49 +231,32 @@ def forward( return loss, sub_losses - def _preprocess_bbox_target( - self, bbox_target: Tensor, batch_size: int, scale_tensor: Tensor - ) -> Tensor: - """Preprocess target bboxes in shape [batch_size, N, 5] where N is maximum - number of instances in one image.""" - sample_ids, counts = cast( - tuple[Tensor, Tensor], - torch.unique(bbox_target[:, 0].int(), return_counts=True), - ) - c_max = int(counts.max()) if counts.numel() > 0 else 0 - out_target = torch.zeros(batch_size, c_max, 5, device=bbox_target.device) - out_target[:, :, 0] = -1 - for id, count in zip(sample_ids, counts): - out_target[id, :count] = bbox_target[bbox_target[:, 0] == id][:, 1:] - - scaled_target = out_target[:, :, 1:5] * scale_tensor - 
out_target[..., 1:] = box_convert(scaled_target, "xywh", "xyxy") - return out_target - def _preprocess_kpts_target( self, kpts_target: Tensor, batch_size: int, scale_tensor: Tensor ) -> Tensor: - """Preprocesses the target keypoints in shape [batch_size, N, n_keypoints, 3] - where N is the maximum number of keypoints in one image.""" + """Preprocesses the target keypoints in shape [batch_size, N, + n_keypoints, 3] where N is the maximum number of keypoints in + one image.""" _, counts = torch.unique(kpts_target[:, 0].int(), return_counts=True) max_kpts = int(counts.max()) if counts.numel() > 0 else 0 batched_keypoints = torch.zeros( - (batch_size, max_kpts, self.n_kps, 3), device=kpts_target.device + (batch_size, max_kpts, self.n_keypoints, 3), + device=kpts_target.device, ) for i in range(batch_size): keypoints_i = kpts_target[kpts_target[:, 0] == i] scaled_keypoints_i = keypoints_i[:, 2:].clone() - batched_keypoints[i, : keypoints_i.shape[0]] = scaled_keypoints_i.view( - -1, self.n_kps, 3 + batched_keypoints[i, : keypoints_i.shape[0]] = ( + scaled_keypoints_i.view(-1, self.n_keypoints, 3) ) batched_keypoints[i, :, :, :2] *= scale_tensor[:2] return batched_keypoints def dist2kpts_noscale(self, anchor_points: Tensor, kpts: Tensor) -> Tensor: - """Adjusts and scales predicted keypoints relative to anchor points without - considering image stride.""" + """Adjusts and scales predicted keypoints relative to anchor + points without considering image stride.""" adj_kpts = kpts.clone() scale = 2.0 x_adj = anchor_points[:, [0]] - 0.5 @@ -341,32 +267,13 @@ def dist2kpts_noscale(self, anchor_points: Tensor, kpts: Tensor) -> Tensor: adj_kpts[..., 1] += y_adj return adj_kpts - -class VarifocalLoss(nn.Module): - def __init__(self, alpha: float = 0.75, gamma: float = 2.0): - """Varifocal Loss is a loss function for training a dense object detector to predict - the IoU-aware classification score, inspired by focal loss. - Code is adapted from: U{https://github.com/Nioolek/PPYOLOE_pytorch/blob/master/ppyoloe/models/losses.py} - - @type alpha: float - @param alpha: alpha parameter in focal loss, default is 0.75. - @type gamma: float - @param gamma: gamma parameter in focal loss, default is 2.0. 
- """ - - super().__init__() - - self.alpha = alpha - self.gamma = gamma - - def forward( - self, pred_score: Tensor, target_score: Tensor, label: Tensor - ) -> Tensor: - weight = ( - self.alpha * pred_score.pow(self.gamma) * (1 - label) + target_score * label - ) - ce_loss = F.binary_cross_entropy( - pred_score.float(), target_score.float(), reduction="none" + def _init_parameters(self, features: list[Tensor]): + device = features[0].device + super()._init_parameters(features) + self.gt_kpts_scale = torch.tensor( + [ + self.original_img_size[1], + self.original_img_size[0], + ], + device=device, ) - loss = (ce_loss * weight).sum() - return loss diff --git a/luxonis_train/attached_modules/losses/implicit_keypoint_bbox_loss.py b/luxonis_train/attached_modules/losses/implicit_keypoint_bbox_loss.py index d174c555..8c9230ae 100644 --- a/luxonis_train/attached_modules/losses/implicit_keypoint_bbox_loss.py +++ b/luxonis_train/attached_modules/losses/implicit_keypoint_bbox_loss.py @@ -1,17 +1,20 @@ -from typing import cast +import logging +from typing import Any, cast import torch +from luxonis_ml.data import LabelType from torch import Tensor from torchvision.ops import box_convert from luxonis_train.attached_modules.losses.keypoint_loss import KeypointLoss from luxonis_train.nodes import ImplicitKeypointBBoxHead -from luxonis_train.utils.boxutils import ( +from luxonis_train.utils import ( + Labels, + Packet, compute_iou_loss, match_to_anchor, process_bbox_predictions, ) -from luxonis_train.utils.types import IncompatibleException, Labels, LabelType, Packet from .base_loss import BaseLoss from .bce_with_logits import BCEWithLogitsLoss @@ -25,7 +28,10 @@ list[Tensor], ] +logger = logging.getLogger(__name__) + +# TODO: BROKEN! class ImplicitKeypointBBoxLoss(BaseLoss[list[Tensor], KeypointTargetType]): node: ImplicitKeypointBBoxHead supported_labels = [(LabelType.BOUNDINGBOX, LabelType.KEYPOINTS)] @@ -47,10 +53,10 @@ def __init__( anchor_threshold: float = 4.0, bias: float = 0.5, balance: list[float] | None = None, - **kwargs, + **kwargs: Any, ): - """Joint loss for keypoint and box predictions for cases where the keypoints and - boxes are inherently linked. + """Joint loss for keypoint and box predictions for cases where + the keypoints and boxes are inherently linked. Based on U{YOLO-Pose: Enhancing YOLO for Multi Person Pose Estimation Using Object Keypoint Similarity Loss}. @@ -89,34 +95,29 @@ def __init__( super().__init__(**kwargs) - if not isinstance(self.node, ImplicitKeypointBBoxHead): - raise IncompatibleException( - f"Loss `{self.name}` is only " - "compatible with nodes of type `ImplicitKeypointBBoxHead`." - ) - self.n_classes = self.node.n_classes - self.n_keypoints = self.node.n_keypoints self.n_anchors = self.node.n_anchors - self.num_heads = self.node.num_heads + self.n_heads = self.node.n_heads self.box_offset = self.node.box_offset self.anchors = self.node.anchors self.balance = balance or [4.0, 1.0, 0.4] - if len(self.balance) < self.num_heads: - raise ValueError( - f"Balance list must have at least {self.num_heads} elements." + if len(self.balance) < self.n_heads: + logger.warning( + f"Balance list must have at least {self.n_heads} elements." + "Filling the rest with 1.0." 
) + self.balance += [1.0] * (self.n_heads - len(self.balance)) self.min_objectness_iou = min_objectness_iou self.bbox_weight = bbox_loss_weight self.class_weight = class_loss_weight self.objectness_weight = objectness_loss_weight - self.kpt_visibility_weight = keypoint_visibility_loss_weight - self.keypoint_regression_loss_weight = keypoint_regression_loss_weight self.anchor_threshold = anchor_threshold self.bias = bias - self.b_cross_entropy = BCEWithLogitsLoss(pos_weight=torch.tensor([obj_pw])) + self.b_cross_entropy = BCEWithLogitsLoss( + pos_weight=torch.tensor([obj_pw]) + ) self.class_loss = SmoothBCEWithLogitsLoss( label_smoothing=label_smoothing, bce_pow=cls_pw, @@ -126,6 +127,8 @@ def __init__( bce_power=viz_pw, sigmas=sigmas, area_factor=area_factor, + regression_loss_weight=keypoint_regression_loss_weight, + visibility_loss_weight=keypoint_visibility_loss_weight, ) self.positive_smooth_const = 1 - 0.5 * label_smoothing @@ -134,38 +137,44 @@ def __init__( def prepare( self, outputs: Packet[Tensor], labels: Labels ) -> tuple[list[Tensor], KeypointTargetType]: - """Prepares the labels to be in the correct format for loss calculation. + """Prepares the labels to be in the correct format for loss + calculation. @type outputs: Packet[Tensor] @param outputs: Output from the forward pass. @type labels: L{Labels} @param labels: Dictionary containing the labels. - @rtype: tuple[list[Tensor], tuple[list[Tensor], list[Tensor], list[Tensor], - list[tuple[Tensor, Tensor, Tensor, Tensor]], list[Tensor]]] - @return: Tuple containing the original output and the postprocessed labels. The - processed labels are a tuple containing the class targets, box targets, - keypoint targets, indices and anchors. Indicies are a tuple containing - vectors of indices for batch, anchor, feature y and feature x dimensions, - respectively. They are all of shape (n_targets,). The indices are used to - index the output tensors of shape (batch_size, n_anchors, feature_height, - feature_width, n_classes + box_offset + n_keypoints * 3) to get a tensor of - shape (n_targets, n_classes + box_offset + n_keypoints * 3). + @rtype: tuple[list[Tensor], tuple[list[Tensor], list[Tensor], + list[Tensor], list[tuple[Tensor, Tensor, Tensor, Tensor]], + list[Tensor]]] + @return: Tuple containing the original output and the + postprocessed labels. The processed labels are a tuple + containing the class targets, box targets, keypoint targets, + indices and anchors. Indicies are a tuple containing vectors + of indices for batch, anchor, feature y and feature x + dimensions, respectively. They are all of shape + (n_targets,). The indices are used to index the output + tensors of shape (batch_size, n_anchors, feature_height, + feature_width, n_classes + box_offset + n_keypoints * 3) to + get a tensor of shape (n_targets, n_classes + box_offset + + n_keypoints * 3). 
""" predictions = self.get_input_tensors(outputs, "features") - kpts = self.get_label(labels, LabelType.KEYPOINTS)[0] - boxes = self.get_label(labels, LabelType.BOUNDINGBOX)[0] + kpt_label = self.get_label(labels, LabelType.KEYPOINTS) + bbox_label = self.get_label(labels, LabelType.BOUNDINGBOX) - nkpts = (kpts.shape[1] - 2) // 3 - targets = torch.zeros((len(boxes), nkpts * 3 + self.box_offset + 1)) - targets[:, :2] = boxes[:, :2] + targets = torch.zeros( + (kpt_label.shape[0], self.n_keypoints * 3 + self.box_offset + 1) + ) + targets[:, :2] = kpt_label[:, :2] targets[:, 2 : self.box_offset + 1] = box_convert( - boxes[:, 2:], "xywh", "cxcywh" + bbox_label[:, 2:], "xywh", "cxcywh" ) - targets[:, self.box_offset + 1 :: 3] = kpts[:, 2::3] # insert kp x coordinates - targets[:, self.box_offset + 2 :: 3] = kpts[:, 3::3] # insert kp y coordinates - targets[:, self.box_offset + 3 :: 3] = kpts[:, 4::3] # insert kp visibility + # insert keypoints + for i in range(1, 4): + targets[:, self.box_offset + i :: 3] = kpt_label[:, i + 1 :: 3] n_targets = targets.shape[0] @@ -176,21 +185,26 @@ def prepare( anchors: list[Tensor] = [] anchor_indices = ( - torch.arange(self.n_anchors, device=targets.device, dtype=torch.float32) + torch.arange( + self.n_anchors, device=targets.device, dtype=torch.float32 + ) .reshape(self.n_anchors, 1) .repeat(1, n_targets) .unsqueeze(-1) ) - targets = torch.cat((targets.repeat(self.n_anchors, 1, 1), anchor_indices), 2) + targets = torch.cat( + (targets.repeat(self.n_anchors, 1, 1), anchor_indices), 2 + ) xy_deltas = ( torch.tensor( - [[0, 0], [1, 0], [0, 1], [-1, 0], [0, -1]], device=targets.device + [[0, 0], [1, 0], [0, 1], [-1, 0], [0, -1]], + device=targets.device, ).float() * self.bias ) - for i in range(self.num_heads): + for i in range(self.n_heads): anchor = self.anchors[i] feature_height, feature_width = predictions[i].shape[2:4] scaled_targets, xy_shifts = match_to_anchor( @@ -251,9 +265,15 @@ def forward( "kpt_regression": torch.tensor(0.0, device=device), } - for pred, class_target, box_target, kpt_target, index, anchor, balance in zip( - predictions, *targets, self.balance - ): + for ( + pred, + class_target, + box_target, + kpt_target, + index, + anchor, + balance, + ) in zip(predictions, *targets, self.balance): obj_targets = torch.zeros_like(pred[..., 0], device=device) n_targets = len(class_target) @@ -280,13 +300,8 @@ def forward( kpt_target.to(device), area.to(device), ) - - sub_losses["kpt_regression"] += ( - kpt_sublosses["regression"] * self.keypoint_regression_loss_weight - ) - sub_losses["kpt_visibility"] += ( - kpt_sublosses["visibility"] * self.kpt_visibility_weight - ) + for name, kpt_subloss in kpt_sublosses.items(): + sub_losses[name] += kpt_subloss obj_targets[index] = (self.min_objectness_iou) + ( 1 - self.min_objectness_iou @@ -295,11 +310,10 @@ def forward( if self.n_classes > 1: sub_losses["class"] += ( self.class_loss.forward( - [ - pred_subset[ - :, - self.box_offset : self.box_offset + self.n_classes, - ] + pred_subset[ + :, + self.box_offset : self.box_offset + + self.n_classes, ], class_target, ) @@ -315,7 +329,9 @@ def forward( loss = cast(Tensor, sum(sub_losses.values())).reshape([]) return loss, {name: loss.detach() for name, loss in sub_losses.items()} - def _create_keypoint_target(self, scaled_targets: Tensor, box_xy_deltas: Tensor): + def _create_keypoint_target( + self, scaled_targets: Tensor, box_xy_deltas: Tensor + ): keypoint_target = scaled_targets[:, self.box_offset + 1 : -1] for j in range(self.n_keypoints): idx = 3 * j diff 
--git a/luxonis_train/attached_modules/losses/keypoint_loss.py b/luxonis_train/attached_modules/losses/keypoint_loss.py index d5ca278f..c17ac7a1 100644 --- a/luxonis_train/attached_modules/losses/keypoint_loss.py +++ b/luxonis_train/attached_modules/losses/keypoint_loss.py @@ -1,17 +1,20 @@ +from typing import Any + import torch +from luxonis_ml.data import LabelType from torch import Tensor -from luxonis_train.attached_modules.metrics.object_keypoint_similarity import ( - get_area_factor, +from luxonis_train.utils import ( get_sigmas, + get_with_default, + process_keypoints_predictions, ) -from luxonis_train.utils.boxutils import process_keypoints_predictions -from luxonis_train.utils.types import Labels, LabelType, Packet from .base_loss import BaseLoss from .bce_with_logits import BCEWithLogitsLoss +# TODO: Make it work on its own class KeypointLoss(BaseLoss[Tensor, Tensor]): supported_labels = [LabelType.KEYPOINTS] @@ -21,73 +24,89 @@ def __init__( bce_power: float = 1.0, sigmas: list[float] | None = None, area_factor: float | None = None, - **kwargs, + regression_loss_weight: float = 1.0, + visibility_loss_weight: float = 1.0, + **kwargs: Any, ): - """Keypoint based loss that is computed from OKS-based regression and visibility - loss. + """Keypoint based loss that is computed from OKS-based + regression and visibility loss. @type n_keypoints: int @param n_keypoints: Number of keypoints. @type bce_power: float - @param bce_power: Power used for BCE visibility loss. Defaults to C{1.0}. - @param sigmas: Sigmas used for OKS. If None then use COCO ones if possible or - default ones. Defaults to C{None}. + @param bce_power: Power used for BCE visibility loss. Defaults + to C{1.0}. + @param sigmas: Sigmas used for OKS. If None then use COCO ones + if possible or default ones. Defaults to C{None}. @type area_factor: float | None - @param area_factor: Factor by which we multiply bbox area. If None then use - default one. Defaults to C{None}. + @param area_factor: Factor by which we multiply bbox area. If + None then use default one. Defaults to C{None}. + @type regression_loss_weight: float + @param regression_loss_weight: Weight of regression loss. + Defaults to C{1.0}. + @type visibility_loss_weight: float + @param visibility_loss_weight: Weight of visibility loss. + Defaults to C{1.0}. """ super().__init__(**kwargs) self.b_cross_entropy = BCEWithLogitsLoss( pos_weight=torch.tensor([bce_power]), **kwargs ) - self.sigmas = get_sigmas( - sigmas=sigmas, n_keypoints=n_keypoints, class_name=self.name + self.sigmas = get_sigmas(sigmas, n_keypoints, caller_name=self.name) + self.area_factor = get_with_default( + area_factor, "bbox area scaling", self.name, default=0.53 ) - self.area_factor = get_area_factor(area_factor, class_name=self.name) - - def prepare(self, inputs: Packet[Tensor], labels: Labels) -> tuple[Tensor, Tensor]: - return torch.cat(inputs["keypoints"], dim=0), self.get_label(labels)[0] + self.regression_loss_weight = regression_loss_weight + self.visibility_loss_weight = visibility_loss_weight def forward( self, prediction: Tensor, target: Tensor, area: Tensor ) -> tuple[Tensor, dict[str, Tensor]]: - """Computes the keypoint loss and visibility loss for a given prediction and - target. + """Computes the keypoint loss and visibility loss for a given + prediction and target. @type prediction: Tensor - @param prediction: Predicted tensor of shape C{[n_detections, n_keypoints * 3]}. + @param prediction: Predicted tensor of shape C{[n_detections, + n_keypoints * 3]}. 
@type target: Tensor - @param target: Target tensor of shape C{[n_detections, n_keypoints * 3]}. + @param target: Target tensor of shape C{[n_detections, + n_keypoints * 3]}. @type area: Tensor @param area: Area tensor of shape C{[n_detections]}. @rtype: tuple[Tensor, dict[str, Tensor]] - @return: A tuple containing the total loss tensor of shape C{[1,]} and a - dictionary with the regression loss and visibility loss tensors. + @return: A tuple containing the total loss tensor of shape + C{[1,]} and a dictionary with the regression loss and + visibility loss tensors. """ - device = prediction.device - sigmas = self.sigmas.to(device) + sigmas = self.sigmas.to(prediction.device) pred_x, pred_y, pred_v = process_keypoints_predictions(prediction) - gt_x = target[:, 0::3] - gt_y = target[:, 1::3] - gt_v = (target[:, 2::3] > 0).float() + target_x = target[:, 0::3] + target_y = target[:, 1::3] + target_visibility = (target[:, 2::3] > 0).float() - visibility_loss = self.b_cross_entropy.forward(pred_v, gt_v) + visibility_loss = ( + self.b_cross_entropy.forward(pred_v, target_visibility) + * self.visibility_loss_weight + ) scales = area * self.area_factor - d = (gt_x - pred_x) ** 2 + (gt_y - pred_y) ** 2 - e = d / (2 * sigmas**2) / (scales.view(-1, 1) + 1e-9) / 2 + distance = (target_x - pred_x) ** 2 + (target_y - pred_y) ** 2 + normalized_distance = ( + distance / (2 * sigmas**2) / (scales.view(-1, 1) + 1e-9) / 2 + ) - regression_loss_unreduced = 1 - torch.exp(-e) - regression_loss_reduced = (regression_loss_unreduced * gt_v).sum(dim=1) / ( - gt_v.sum(dim=1) + 1e-9 + regression_loss = 1 - torch.exp(-normalized_distance) + regression_loss = (regression_loss * target_visibility).sum(dim=1) / ( + target_visibility.sum(dim=1) + 1e-9 ) - regression_loss = regression_loss_reduced.mean() + regression_loss = regression_loss.mean() + regression_loss *= self.regression_loss_weight total_loss = regression_loss + visibility_loss return total_loss, { - "regression": regression_loss, - "visibility": visibility_loss, + "kpt_regression": regression_loss, + "kpt_visibility": visibility_loss, } diff --git a/luxonis_train/attached_modules/losses/sigmoid_focal_loss.py b/luxonis_train/attached_modules/losses/sigmoid_focal_loss.py index f3affc74..884d4863 100644 --- a/luxonis_train/attached_modules/losses/sigmoid_focal_loss.py +++ b/luxonis_train/attached_modules/losses/sigmoid_focal_loss.py @@ -1,4 +1,4 @@ -from typing import Literal +from typing import Any, Literal from luxonis_ml.data import LabelType from torch import Tensor @@ -15,7 +15,7 @@ def __init__( alpha: float = 0.25, gamma: float = 2.0, reduction: Literal["none", "mean", "sum"] = "mean", - **kwargs, + **kwargs: Any, ): """Focal loss from U{Focal Loss for Dense Object Detection }. 
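# Aside: a hedged numeric sketch of the OKS-style regression term computed
# in `KeypointLoss.forward` above; sigmas, area, and coordinates are
# made-up values, and 0.53 is the default area factor.
import torch

sigmas = torch.tensor([0.05, 0.05])  # per-keypoint falloff
area = torch.tensor([100.0])
area_factor = 0.53

pred_xy = torch.tensor([[[0.52, 0.50], [0.10, 0.20]]])
target_xy = torch.tensor([[[0.50, 0.50], [0.00, 0.00]]])
visibility = torch.tensor([[1.0, 0.0]])  # second keypoint unlabeled

d = ((target_xy - pred_xy) ** 2).sum(-1)
e = d / (2 * sigmas**2) / (area.view(-1, 1) * area_factor + 1e-9) / 2
loss = ((1 - torch.exp(-e)) * visibility).sum(1) / (visibility.sum(1) + 1e-9)
print(loss)  # only the visible keypoint contributes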
@@ -37,7 +37,11 @@ def __init__( def forward(self, preds: Tensor, target: Tensor) -> Tensor: loss = sigmoid_focal_loss( - preds, target, alpha=self.alpha, gamma=self.gamma, reduction=self.reduction + preds, + target, + alpha=self.alpha, + gamma=self.gamma, + reduction=self.reduction, ) return loss diff --git a/luxonis_train/attached_modules/losses/smooth_bce_with_logits.py b/luxonis_train/attached_modules/losses/smooth_bce_with_logits.py index ac976428..edc2bf98 100644 --- a/luxonis_train/attached_modules/losses/smooth_bce_with_logits.py +++ b/luxonis_train/attached_modules/losses/smooth_bce_with_logits.py @@ -1,4 +1,4 @@ -from typing import Literal +from typing import Any, Literal import torch from luxonis_ml.data import LabelType @@ -17,31 +17,32 @@ def __init__( bce_pow: float = 1.0, weight: list[float] | None = None, reduction: Literal["mean", "sum", "none"] = "mean", - **kwargs, + **kwargs: Any, ): """BCE with logits loss and label smoothing. @type label_smoothing: float - @param label_smoothing: Label smoothing factor. Defaults to C{0.0}. + @param label_smoothing: Label smoothing factor. Defaults to + C{0.0}. @type bce_pow: float @param bce_pow: Weight for positive samples. Defaults to C{1.0}. @type weight: list[float] | None - @param weight: a manual rescaling weight given to the loss of each batch - element. If given, it has to be a list of length C{nbatch}. + @param weight: a manual rescaling weight given to the loss of + each batch element. If given, it has to be a list of length + C{nbatch}. @type reduction: Literal["mean", "sum", "none"] - @param reduction: Specifies the reduction to apply to the output: C{'none'} | - C{'mean'} | C{'sum'}. C{'none'}: no reduction will be applied, C{'mean'}: - the sum of the output will be divided by the number of elements in the - output, C{'sum'}: the output will be summed. Note: C{size_average} and - C{reduce} are in the process of being deprecated, and in the meantime, - specifying either of those two args will override C{reduction}. Defaults to - C{'mean'}. - @type kwargs: dict - @param kwargs: Additional arguments to pass to L{BaseLoss}. + @param reduction: Specifies the reduction to apply to the + output: C{'none'} | C{'mean'} | C{'sum'}. C{'none'}: no + reduction will be applied, C{'mean'}: the sum of the output + will be divided by the number of elements in the output, + C{'sum'}: the output will be summed. Note: C{size_average} + and C{reduce} are in the process of being deprecated, and in + the meantime, specifying either of those two args will + override C{reduction}. Defaults to C{'mean'}. """ super().__init__(**kwargs) - self.negative_smooth_const = 1.0 - 0.5 * label_smoothing - self.positive_smooth_const = 0.5 * label_smoothing + self.positive_smooth_const = 1.0 - label_smoothing + self.negative_smooth_const = label_smoothing self.criterion = BCEWithLogitsLoss( pos_weight=torch.tensor( [bce_pow], @@ -50,24 +51,26 @@ def __init__( reduction=reduction, ) - def forward(self, predictions: list[Tensor], target: Tensor) -> Tensor: + def forward(self, predictions: Tensor, target: Tensor) -> Tensor: """Computes the BCE loss with label smoothing. - @type predictions: list[Tensor] - @param predictions: List of tensors of shape (N, n_classes), containing the - predicted class scores. + @type predictions: Tensor + @param predictions: Network predictions of shape (N, C, ...) @type target: Tensor - @param target: A tensor of shape (N,), containing the ground-truth class labels + @param target: A tensor of the same shape as predictions. 
@rtype: Tensor @return: A scalar tensor. """ - prediction = predictions[0] - smoothed_target = torch.full_like( - prediction, - self.negative_smooth_const, - device=prediction.device, - ) - smoothed_target[ - torch.arange(target.shape[0]), target - ] = self.positive_smooth_const - return self.criterion.forward(prediction, smoothed_target) + if predictions.shape != target.shape: + raise RuntimeError( + f"Target tensor dimension ({target.shape}) and predictions tensor " + f"dimension ({predictions.shape}) should be the same." + ) + + if self.negative_smooth_const != 0.0: + target = ( + target * self.positive_smooth_const + + (1 - target) * self.negative_smooth_const + ) + + return self.criterion(predictions, target) diff --git a/luxonis_train/attached_modules/losses/softmax_focal_loss.py b/luxonis_train/attached_modules/losses/softmax_focal_loss.py index 14f32e54..43c844f3 100644 --- a/luxonis_train/attached_modules/losses/softmax_focal_loss.py +++ b/luxonis_train/attached_modules/losses/softmax_focal_loss.py @@ -1,6 +1,5 @@ -# TODO: document - -from typing import Literal +import logging +from typing import Any, Literal import torch from luxonis_ml.data import LabelType @@ -10,21 +9,26 @@ from .cross_entropy import CrossEntropyLoss +logger = logging.getLogger(__name__) + +# TODO: Add support for multi-class tasks class SoftmaxFocalLoss(BaseLoss[Tensor, Tensor]): supported_labels = [LabelType.SEGMENTATION, LabelType.CLASSIFICATION] def __init__( self, - alpha: float | list[float] = 0.25, + alpha: float = 0.25, gamma: float = 2.0, reduction: Literal["none", "mean", "sum"] = "mean", - **kwargs, + **kwargs: Any, ): - """Focal loss implementation for multi-class/multi-label tasks using Softmax. + """Focal loss implementation for binary classification and + segmentation tasks using Softmax. - @type alpha: float | list[float] - @param alpha: Weighting factor for the rare class. Defaults to C{0.25}. + @type alpha: float + @param alpha: Weighting factor for the rare class. Defaults to + C{0.25}. @type gamma: float @param gamma: Focusing parameter. Defaults to C{2.0}. 
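# Aside: an assumption-free restatement of the smoothing rule implemented
# in `SmoothBCEWithLogitsLoss.forward` above. With label_smoothing = 0.1,
# hard targets 1/0 become 0.9/0.1.
import torch

label_smoothing = 0.1
target = torch.tensor([1.0, 0.0])
smoothed = target * (1 - label_smoothing) + (1 - target) * label_smoothing
print(smoothed)  # tensor([0.9000, 0.1000])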
@type reduction: Literal["none", "mean", "sum"]
@@ -40,13 +44,7 @@ def __init__(
     def forward(self, predictions: Tensor, target: Tensor) -> Tensor:
         ce_loss = self.ce_criterion.forward(predictions, target)
         pt = torch.exp(-ce_loss)
-        loss = ce_loss * ((1 - pt) ** self.gamma)
-
-        if isinstance(self.alpha, float) and self.alpha >= 0:
-            loss = self.alpha * loss
-        elif isinstance(self.alpha, list):
-            alpha_t = torch.tensor(self.alpha)[target]
-            loss = alpha_t * loss
+        loss = ce_loss * ((1 - pt) ** self.gamma) * self.alpha
 
         if self.reduction == "mean":
             loss = loss.mean()
diff --git a/luxonis_train/attached_modules/metrics/__init__.py b/luxonis_train/attached_modules/metrics/__init__.py
index 9e73e4ac..b1dc40ea 100644
--- a/luxonis_train/attached_modules/metrics/__init__.py
+++ b/luxonis_train/attached_modules/metrics/__init__.py
@@ -1,8 +1,8 @@
 from .base_metric import BaseMetric
-from .common import Accuracy, F1Score, JaccardIndex, Precision, Recall
 from .mean_average_precision import MeanAveragePrecision
 from .mean_average_precision_keypoints import MeanAveragePrecisionKeypoints
 from .object_keypoint_similarity import ObjectKeypointSimilarity
+from .torchmetrics import Accuracy, F1Score, JaccardIndex, Precision, Recall
 
 __all__ = [
     "Accuracy",
diff --git a/luxonis_train/attached_modules/metrics/base_metric.py b/luxonis_train/attached_modules/metrics/base_metric.py
index b2e456c9..a4109d2d 100644
--- a/luxonis_train/attached_modules/metrics/base_metric.py
+++ b/luxonis_train/attached_modules/metrics/base_metric.py
@@ -5,8 +5,8 @@
 from typing_extensions import TypeVarTuple, Unpack
 
 from luxonis_train.attached_modules import BaseAttachedModule
+from luxonis_train.utils import Labels, Packet
 from luxonis_train.utils.registry import METRICS
-from luxonis_train.utils.types import Labels, Packet
 
 Ts = TypeVarTuple("Ts")
 
@@ -19,8 +19,9 @@ class BaseMetric(
 ):
     """A base class for all metrics.
 
-    This class defines the basic interface for all metrics. It utilizes automatic
-    registration of defined subclasses to a L{METRICS} registry.
+    This class defines the basic interface for all metrics. It utilizes
+    automatic registration of defined subclasses to a L{METRICS}
+    registry.
     """
 
     @abstractmethod
@@ -33,7 +34,9 @@ def update(self, *args: Unpack[Ts]) -> None:
         ...
 
     @abstractmethod
-    def compute(self) -> Tensor | tuple[Tensor, dict[str, Tensor]] | dict[str, Tensor]:
+    def compute(
+        self,
+    ) -> Tensor | tuple[Tensor, dict[str, Tensor]] | dict[str, Tensor]:
         """Computes the metric.
 
         @rtype: Tensor | tuple[Tensor, dict[str, Tensor]] | dict[str, Tensor]
@@ -48,12 +51,14 @@ def compute(self) -> Tensor | tuple[Tensor, dict[str, Tensor]] | dict[str, Tenso
     def run_update(self, outputs: Packet[Tensor], labels: Labels) -> None:
         """Calls the metric's update method.
 
-        Validates and prepares the inputs, then calls the metric's update method.
+        Validates and prepares the inputs, then calls the metric's
+        update method.
 
         @type outputs: Packet[Tensor]
         @param outputs: The outputs of the model.
         @type labels: Labels
-        @param labels: The labels of the model. @raises L{IncompatibleException}: If the
-        inputs are not compatible with the module.
+        @param labels: The labels of the model.
+        @raises L{IncompatibleException}: If the inputs are not
+            compatible with the module.
""" self.update(*self.prepare(outputs, labels)) diff --git a/luxonis_train/attached_modules/metrics/common.py b/luxonis_train/attached_modules/metrics/common.py deleted file mode 100644 index 97e8a7ec..00000000 --- a/luxonis_train/attached_modules/metrics/common.py +++ /dev/null @@ -1,92 +0,0 @@ -import logging - -import torchmetrics -from luxonis_ml.data import LabelType -from torch import Tensor - -from .base_metric import BaseMetric - -logger = logging.getLogger(__name__) - - -class TorchMetricWrapper(BaseMetric): - def __init__(self, **kwargs): - super().__init__(node=kwargs.pop("node", None)) - task = kwargs.get("task") - - if self.node.n_classes > 1: - if task == "binary": - raise ValueError( - f"Task type set to '{task}', but the dataset has more than 1 class. " - f"Set the `task` parameter for {self.name} to either 'multiclass' or 'multilabel'." - ) - task = "multiclass" - else: - if task == "multiclass": - raise ValueError( - f"Task type set to '{task}', but the dataset has only 1 class. " - f"Set the `task` parameter for {self.name} to 'binary'." - ) - task = "binary" - if "task" not in kwargs: - logger.warning( - f"Task type not specified for {self.name}, assuming '{task}'. " - "If this is not correct, please set the `task` parameter explicitly." - ) - kwargs["task"] = task - self._task = task - - if self._task == "multiclass": - if "num_classes" not in kwargs: - if self.node is None: - raise ValueError( - "Either `node` or `num_classes` must be provided to " - "multiclass torchmetrics." - ) - kwargs["num_classes"] = self.node.n_classes - elif self._task == "multilabel": - if "num_labels" not in kwargs: - if self.node is None: - raise ValueError( - "Either `node` or `num_labels` must be provided to " - "multilabel torchmetrics." - ) - kwargs["num_labels"] = self.node.n_classes - - self.metric = self.Metric(**kwargs) - - def update(self, preds, target, *args, **kwargs) -> None: - if self._task in ["multiclass"]: - target = target.argmax(dim=1) - self.metric.update(preds, target, *args, **kwargs) - - def compute(self) -> Tensor: - return self.metric.compute() - - def reset(self) -> None: - self.metric.reset() - - -class Accuracy(TorchMetricWrapper): - supported_labels = [LabelType.CLASSIFICATION, LabelType.SEGMENTATION] - Metric = torchmetrics.Accuracy - - -class F1Score(TorchMetricWrapper): - supported_labels = [LabelType.CLASSIFICATION, LabelType.SEGMENTATION] - Metric = torchmetrics.F1Score - - -class JaccardIndex(TorchMetricWrapper): - supported_labels = [LabelType.CLASSIFICATION, LabelType.SEGMENTATION] - Metric = torchmetrics.JaccardIndex - - -class Precision(TorchMetricWrapper): - supported_labels = [LabelType.CLASSIFICATION, LabelType.SEGMENTATION] - Metric = torchmetrics.Precision - - -class Recall(TorchMetricWrapper): - supported_labels = [LabelType.CLASSIFICATION, LabelType.SEGMENTATION] - Metric = torchmetrics.Recall diff --git a/luxonis_train/attached_modules/metrics/mean_average_precision.py b/luxonis_train/attached_modules/metrics/mean_average_precision.py index ffdf5e22..6d51f55b 100644 --- a/luxonis_train/attached_modules/metrics/mean_average_precision.py +++ b/luxonis_train/attached_modules/metrics/mean_average_precision.py @@ -1,23 +1,29 @@ +from typing import Any + import torchmetrics.detection as detection +from luxonis_ml.data import LabelType from torch import Tensor from torchvision.ops import box_convert -from luxonis_train.utils.types import Labels, LabelType, Packet +from luxonis_train.utils import Labels, Packet from .base_metric import BaseMetric 
-class MeanAveragePrecision(BaseMetric): - """Compute the Mean-Average-Precision (mAP) and Mean-Average-Recall (mAR) for object - detection predictions. +class MeanAveragePrecision( + BaseMetric[list[dict[str, Tensor]], list[dict[str, Tensor]]] +): + """Compute the Mean-Average-Precision (mAP) and Mean-Average-Recall + (mAR) for object detection predictions. - Adapted from U{Mean-Average-Precision (mAP) and Mean-Average-Recall (mAR) + Adapted from U{Mean-Average-Precision (mAP) and Mean-Average-Recall + (mAR) }. """ supported_labels = [LabelType.BOUNDINGBOX] - def __init__(self, **kwargs): + def __init__(self, **kwargs: Any): super().__init__(**kwargs) self.metric = detection.MeanAveragePrecision() @@ -29,12 +35,12 @@ def update( self.metric.update(outputs, labels) def prepare( - self, outputs: Packet[Tensor], labels: Labels + self, inputs: Packet[Tensor], labels: Labels ) -> tuple[list[dict[str, Tensor]], list[dict[str, Tensor]]]: - box_label = self.get_label(labels)[0] - output_nms = self.get_input_tensors(outputs) + box_label = self.get_label(labels) + output_nms = self.get_input_tensors(inputs) - image_size = self.node.original_in_shape[1:] + image_size = self.original_in_shape[1:] output_list: list[dict[str, Tensor]] = [] label_list: list[dict[str, Tensor]] = [] @@ -51,7 +57,9 @@ def prepare( curr_bboxs = box_convert(curr_label[:, 2:], "xywh", "xyxy") curr_bboxs[:, 0::2] *= image_size[1] curr_bboxs[:, 1::2] *= image_size[0] - label_list.append({"boxes": curr_bboxs, "labels": curr_label[:, 1].int()}) + label_list.append( + {"boxes": curr_bboxs, "labels": curr_label[:, 1].int()} + ) return output_list, label_list @@ -59,11 +67,21 @@ def reset(self) -> None: self.metric.reset() def compute(self) -> tuple[Tensor, dict[str, Tensor]]: - metric_dict = self.metric.compute() + metric_dict: dict[str, Tensor] = self.metric.compute() del metric_dict["classes"] del metric_dict["map_per_class"] del metric_dict["mar_100_per_class"] + for key in list(metric_dict.keys()): + if "map" in key: + map = metric_dict[key] + mar_key = key.replace("map", "mar") + if mar_key in metric_dict: + mar = metric_dict[mar_key] + metric_dict[key.replace("map", "f1")] = ( + 2 * (map * mar) / (map + mar) + ) + map = metric_dict.pop("map") return map, metric_dict diff --git a/luxonis_train/attached_modules/metrics/mean_average_precision_keypoints.py b/luxonis_train/attached_modules/metrics/mean_average_precision_keypoints.py index 0d558b43..3b34c242 100644 --- a/luxonis_train/attached_modules/metrics/mean_average_precision_keypoints.py +++ b/luxonis_train/attached_modules/metrics/mean_average_precision_keypoints.py @@ -3,21 +3,20 @@ from typing import Any, Literal import torch +from luxonis_ml.data import LabelType from pycocotools.coco import COCO from pycocotools.cocoeval import COCOeval from torch import Tensor from torchvision.ops import box_convert -from luxonis_train.attached_modules.metrics.object_keypoint_similarity import ( - get_area_factor, - get_sigmas, -) -from luxonis_train.utils.types import Labels, LabelType, Packet +from luxonis_train.utils import Labels, Packet, get_sigmas, get_with_default from .base_metric import BaseMetric -class MeanAveragePrecisionKeypoints(BaseMetric): +class MeanAveragePrecisionKeypoints( + BaseMetric[list[dict[str, Tensor]], list[dict[str, Tensor]]] +): """Mean Average Precision metric for keypoints. Uses C{OKS} as IoU measure. 
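The compute() change in mean_average_precision.py above derives an F1 score for every mAP/mAR pair that torchmetrics reports (map_50 and mar_50 yield f1_50, and so on). A minimal standalone sketch of the same harmonic-mean post-processing; the input dict below is illustrative, and, like the diff, the sketch does not guard against map + mar == 0:

import torch

def add_f1_scores(metric_dict: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]:
    # For every "map*" key with a matching "mar*" key, store their
    # harmonic mean under the corresponding "f1*" key. Iterate over a
    # snapshot of the keys because the dict grows inside the loop.
    for key in list(metric_dict.keys()):
        if "map" in key:
            map_value = metric_dict[key]
            mar_key = key.replace("map", "mar")
            if mar_key in metric_dict:
                mar_value = metric_dict[mar_key]
                metric_dict[key.replace("map", "f1")] = (
                    2 * (map_value * mar_value) / (map_value + mar_value)
                )
    return metric_dict

# f1_50 = 2 * (0.8 * 0.6) / (0.8 + 0.6) ≈ 0.686
print(add_f1_scores({"map_50": torch.tensor(0.8), "mar_50": torch.tensor(0.6)}))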
@@ -48,15 +47,14 @@ def __init__(
         box_format: Literal["xyxy", "xywh", "cxcywh"] = "xyxy",
         **kwargs,
     ):
-        """Implementation of the mean average precision metric for keypoint detections.
+        """Implementation of the mean average precision metric for
+        keypoint detections.
 
         Adapted from: U{https://github.com/Lightning-AI/torchmetrics/blob/v1.0.1/src/
         torchmetrics/detection/mean_ap.py}.
 
-        @license: Apache-2.0 License
+        @license: Apache License, Version 2.0
 
-        @type num_keypoints: int
-        @param num_keypoints: Number of keypoints. @type sigmas: list[float] | None
+        @type sigmas: list[float] | None
         @param sigmas: Sigma for each keypoint to weigh its importance, if C{None}, then
         use COCO if possible otherwise defaults. Defaults to C{None}.
@@ -66,15 +64,15 @@ def __init__(
         @param max_dets: Maximum number of detections to be considered per image. Defaults
         to C{20}. @type box_format: Literal["xyxy", "xywh", "cxcywh"]
         @param box_format: Input bbox format.
-        @type kwargs: Any
-        @param kwargs: Additional arguments to pass to L{BaseMetric}.
         """
         super().__init__(**kwargs)
 
-        self.n_keypoints = self.node.n_keypoints
-
-        self.sigmas = get_sigmas(sigmas, self.n_keypoints, self.name)
-        self.area_factor = get_area_factor(area_factor, self.name)
+        self.sigmas = get_sigmas(
+            sigmas, self.n_keypoints, caller_name=self.name
+        )
+        self.area_factor = get_with_default(
+            area_factor, "bbox area scaling", self.name, default=0.53
+        )
         self.max_dets = max_dets
         allowed_box_formats = ("xyxy", "xywh", "cxcywh")
@@ -93,12 +91,16 @@ def __init__(
         self.add_state("groundtruth_labels", default=[], dist_reduce_fx=None)
         self.add_state("groundtruth_area", default=[], dist_reduce_fx=None)
         self.add_state("groundtruth_crowds", default=[], dist_reduce_fx=None)
-        self.add_state("groundtruth_keypoints", default=[], dist_reduce_fx=None)
+        self.add_state(
+            "groundtruth_keypoints", default=[], dist_reduce_fx=None
+        )
 
-    def prepare(self, outputs: Packet[Tensor], labels: Labels):
+    def prepare(
+        self, inputs: Packet[Tensor], labels: Labels
+    ) -> tuple[list[dict[str, Tensor]], list[dict[str, Tensor]]]:
         assert self.node.tasks is not None
-        kpts = self.get_label(labels, LabelType.KEYPOINTS)[0]
-        boxes = self.get_label(labels, LabelType.BOUNDINGBOX)[0]
+        kpts = self.get_label(labels, LabelType.KEYPOINTS)
+        boxes = self.get_label(labels, LabelType.BOUNDINGBOX)
 
         nkpts = (kpts.shape[1] - 2) // 3
         label = torch.zeros((len(boxes), nkpts * 3 + 6))
@@ -108,19 +110,21 @@ def prepare(self, outputs: Packet[Tensor], labels: Labels):
         label[:, 7::3] = kpts[:, 3::3]  # y
         label[:, 8::3] = kpts[:, 4::3]  # visiblity
 
-        output_list_kpt_map = []
-        label_list_kpt_map = []
-        image_size = self.node.original_in_shape[1:]
+        output_list_kpt_map: list[dict[str, Tensor]] = []
+        label_list_kpt_map: list[dict[str, Tensor]] = []
+        image_size = self.original_in_shape[1:]
 
-        output_kpts = self.get_input_tensors(outputs, LabelType.KEYPOINTS)
-        output_bboxes = self.get_input_tensors(outputs, LabelType.BOUNDINGBOX)
+        output_kpts = self.get_input_tensors(inputs, LabelType.KEYPOINTS)
+        output_bboxes = self.get_input_tensors(inputs, LabelType.BOUNDINGBOX)
         for i in range(len(output_kpts)):
             output_list_kpt_map.append(
                 {
                     "boxes": output_bboxes[i][:, :4],
                     "scores": output_bboxes[i][:, 4],
                     "labels": output_bboxes[i][:, 5].int(),
-                    "keypoints": output_kpts[i].reshape(-1, self.n_keypoints * 3),
+                    "keypoints": output_kpts[i].reshape(
+                        -1, self.n_keypoints * 3
+                    ),
                 }
             )
 
@@ -223,7 +227,9 @@ def compute(self) -> tuple[Tensor, dict[str, Tensor]]:
         coco_target.createIndex()
         coco_preds.createIndex()
 
-        self.coco_eval =
COCOeval(coco_target, coco_preds, iouType="keypoints") + self.coco_eval = COCOeval( + coco_target, coco_preds, iouType="keypoints" + ) self.coco_eval.params.kpt_oks_sigmas = self.sigmas.cpu().numpy() self.coco_eval.params.maxDets = [self.max_dets] @@ -254,20 +260,24 @@ def _get_coco_format( crowds: list[Tensor] | None = None, area: list[Tensor] | None = None, ) -> dict[str, list[dict[str, Any]]]: - """Transforms and returns all cached targets or predictions in COCO format. + """Transforms and returns all cached targets or predictions in + COCO format. - Format is defined at U{https://cocodataset.org/#format-data}. + Format is defined at U{ + https://cocodataset.org/#format-data}. """ - images = [] - annotations = [] - annotation_id = 1 # has to start with 1, otherwise COCOEval results are wrong + images: list[dict[str, int]] = [] + annotations: list[dict[str, Any]] = [] + annotation_id = ( + 1 # has to start with 1, otherwise COCOEval results are wrong + ) for image_id, (image_boxes, image_kpts, image_labels) in enumerate( zip(boxes, keypoints, labels) ): - image_boxes_list = image_boxes.cpu().tolist() - image_kpts_list = image_kpts.cpu().tolist() - image_labels_list = image_labels.cpu().tolist() + image_boxes_list: list[list[float]] = image_boxes.cpu().tolist() + image_kpts_list: list[list[float]] = image_kpts.cpu().tolist() + image_labels_list: list[int] = image_labels.cpu().tolist() images.append({"id": image_id}) @@ -297,8 +307,12 @@ def _get_coco_format( else: area_stat = image_box[2] * image_box[3] * self.area_factor - num_keypoints = len( - [i for i in range(2, len(image_kpt), 3) if image_kpt[i] != 0] + n_keypoints = len( + [ + i + for i in range(2, len(image_kpt), 3) + if image_kpt[i] != 0 + ] ) # number of annotated keypoints annotation = { "id": annotation_id, @@ -307,14 +321,18 @@ def _get_coco_format( "area": area_stat, "category_id": image_label, "iscrowd": ( - crowds[image_id][k].cpu().tolist() if crowds is not None else 0 + crowds[image_id][k].cpu().tolist() + if crowds is not None + else 0 ), "keypoints": image_kpt, - "num_keypoints": num_keypoints, + "num_keypoints": n_keypoints, } if scores is not None: score = scores[image_id][k].cpu().tolist() + # `tolist` returns a number for scalar tensors, + # the name is misleading if not isinstance(score, float): raise ValueError( f"Invalid input score of sample {image_id}, element {k}" @@ -325,9 +343,15 @@ def _get_coco_format( annotation_id += 1 classes = [{"id": i, "name": str(i)} for i in self._get_classes()] - return {"images": images, "annotations": annotations, "categories": classes} + return { + "images": images, + "annotations": annotations, + "categories": classes, + } - def _get_safe_item_values(self, item: dict[str, Tensor]) -> tuple[Tensor, Tensor]: + def _get_safe_item_values( + self, item: dict[str, Tensor] + ) -> tuple[Tensor, Tensor]: """Convert and return the boxes.""" boxes = self._fix_empty_tensors(item["boxes"]) if boxes.numel() > 0: @@ -336,7 +360,8 @@ def _get_safe_item_values(self, item: dict[str, Tensor]) -> tuple[Tensor, Tensor return boxes, keypoints def _get_classes(self) -> list[int]: - """Return a list of unique classes found in ground truth and detection data.""" + """Return a list of unique classes found in ground truth and + detection data.""" if len(self.pred_labels) > 0 or len(self.groundtruth_labels) > 0: return ( torch.cat(self.pred_labels + self.groundtruth_labels) @@ -348,7 +373,8 @@ def _get_classes(self) -> list[int]: @staticmethod def _fix_empty_tensors(input_tensor: Tensor) -> Tensor: - 
"""Empty tensors can cause problems in DDP mode, this methods corrects them.""" + """Empty tensors can cause problems in DDP mode, this methods + corrects them.""" if input_tensor.numel() == 0 and input_tensor.ndim == 1: return input_tensor.unsqueeze(0) return input_tensor diff --git a/luxonis_train/attached_modules/metrics/object_keypoint_similarity.py b/luxonis_train/attached_modules/metrics/object_keypoint_similarity.py index 4cbd1cac..503a00ad 100644 --- a/luxonis_train/attached_modules/metrics/object_keypoint_similarity.py +++ b/luxonis_train/attached_modules/metrics/object_keypoint_similarity.py @@ -1,11 +1,13 @@ import logging +from typing import Any import torch +from luxonis_ml.data import LabelType from scipy.optimize import linear_sum_assignment from torch import Tensor from torchvision.ops import box_convert -from luxonis_train.utils.types import Labels, LabelType, Packet +from luxonis_train.utils import Labels, Packet, get_sigmas, get_with_default from .base_metric import BaseMetric @@ -33,46 +35,46 @@ def __init__( sigmas: list[float] | None = None, area_factor: float | None = None, use_cocoeval_oks: bool = True, - **kwargs, + **kwargs: Any, ) -> None: - """Object Keypoint Similarity metric for evaluating keypoint predictions. + """Object Keypoint Similarity metric for evaluating keypoint + predictions. - @type n_keypoints: int - @param n_keypoints: Number of keypoints. @type sigmas: list[float] | None - @param sigmas: Sigma for each keypoint to weigh its importance, if C{None}, then - use COCO if possible otherwise defaults. Defaults to C{None}. + @param sigmas: Sigma for each keypoint to weigh its importance, + if C{None}, then use COCO if possible otherwise defaults. + Defaults to C{None}. @type area_factor: float | None - @param area_factor: Factor by which we multiply bbox area. If None then use - default one. Defaults to C{None}. + @param area_factor: Factor by which we multiply bbox area. If + None then use default one. Defaults to C{None}. @type use_cocoeval_oks: bool - @param use_cocoeval_oks: Whether to use same OKS formula as in COCOeval or use - the one from definition. Defaults to C{True}. + @param use_cocoeval_oks: Whether to use same OKS formula as in + COCOeval or use the one from definition. Defaults to + C{True}. """ super().__init__(**kwargs) - if n_keypoints is None and self.node is None: - raise ValueError( - f"Either `n_keypoints` or `node` must be provided to {self.name}." 
- ) - self.n_keypoints = n_keypoints or self.node.n_keypoints - - self.sigmas = get_sigmas(sigmas, self.n_keypoints, self.name) - self.area_factor = get_area_factor(area_factor, self.name) + self.sigmas = get_sigmas( + sigmas, self.n_keypoints, caller_name=self.name + ) + self.area_factor = get_with_default( + area_factor, "bbox area scaling", self.name, default=0.53 + ) self.use_cocoeval_oks = use_cocoeval_oks self.add_state("pred_keypoints", default=[], dist_reduce_fx=None) - self.add_state("groundtruth_keypoints", default=[], dist_reduce_fx=None) + self.add_state( + "groundtruth_keypoints", default=[], dist_reduce_fx=None + ) self.add_state("groundtruth_scales", default=[], dist_reduce_fx=None) def prepare( - self, outputs: Packet[Tensor], labels: Labels + self, inputs: Packet[Tensor], labels: Labels ) -> tuple[list[dict[str, Tensor]], list[dict[str, Tensor]]]: - assert self.node.tasks is not None - kpts_labels = self.get_label(labels, LabelType.KEYPOINTS)[0] - bbox_labels = self.get_label(labels, LabelType.BOUNDINGBOX)[0] - num_keypoints = (kpts_labels.shape[1] - 2) // 3 - label = torch.zeros((len(bbox_labels), num_keypoints * 3 + 6)) + kpts_labels = self.get_label(labels, LabelType.KEYPOINTS) + bbox_labels = self.get_label(labels, LabelType.BOUNDINGBOX) + n_keypoints = (kpts_labels.shape[1] - 2) // 3 + label = torch.zeros((len(bbox_labels), n_keypoints * 3 + 6)) label[:, :2] = bbox_labels[:, :2] label[:, 2:6] = box_convert(bbox_labels[:, 2:], "xywh", "xyxy") label[:, 6::3] = kpts_labels[:, 2::3] # insert kp x coordinates @@ -81,10 +83,10 @@ def prepare( output_list_oks = [] label_list_oks = [] - image_size = self.node.original_in_shape[1:] + image_size = self.original_in_shape[1:] for i, pred_kpt in enumerate( - self.get_input_tensors(outputs, LabelType.KEYPOINTS) + self.get_input_tensors(inputs, LabelType.KEYPOINTS) ): output_list_oks.append({"keypoints": pred_kpt}) @@ -97,8 +99,12 @@ def prepare( curr_kpts[:, 1::3] *= image_size[0] curr_bboxs_widths = curr_bboxs[:, 2] - curr_bboxs[:, 0] curr_bboxs_heights = curr_bboxs[:, 3] - curr_bboxs[:, 1] - curr_scales = curr_bboxs_widths * curr_bboxs_heights * self.area_factor - label_list_oks.append({"keypoints": curr_kpts, "scales": curr_scales}) + curr_scales = ( + curr_bboxs_widths * curr_bboxs_heights * self.area_factor + ) + label_list_oks.append( + {"keypoints": curr_kpts, "scales": curr_scales} + ) return output_list_oks, label_list_oks @@ -129,11 +135,11 @@ def update( width and height are unnormalized. 
""" for item in preds: - keypoints = fix_empty_tensors(item["keypoints"]) + keypoints = self._fix_empty_tensors(item["keypoints"]) self.pred_keypoints.append(keypoints) for item in target: - keypoints = fix_empty_tensors(item["keypoints"]) + keypoints = self._fix_empty_tensors(item["keypoints"]) self.groundtruth_keypoints.append(keypoints) self.groundtruth_scales.append(item["scales"]) @@ -144,10 +150,14 @@ def compute(self) -> Tensor: image_mean_oks = torch.zeros(len(self.groundtruth_keypoints)) for i, (pred_kpts, gt_kpts, gt_scales) in enumerate( zip( - self.pred_keypoints, self.groundtruth_keypoints, self.groundtruth_scales + self.pred_keypoints, + self.groundtruth_keypoints, + self.groundtruth_scales, ) ): - gt_kpts = torch.reshape(gt_kpts, (-1, self.n_keypoints, 3)) # [N, K, 3] + gt_kpts = torch.reshape( + gt_kpts, (-1, self.n_keypoints, 3) + ) # [N, K, 3] image_ious = compute_oks( pred_kpts, @@ -159,13 +169,23 @@ def compute(self) -> Tensor: gt_indices, pred_indices = linear_sum_assignment( image_ious.cpu().numpy(), maximize=True ) - matched_ious = [image_ious[n, m] for n, m in zip(gt_indices, pred_indices)] + matched_ious = [ + image_ious[n, m] for n, m in zip(gt_indices, pred_indices) + ] image_mean_oks[i] = torch.tensor(matched_ious).mean() final_oks = image_mean_oks.nanmean() return final_oks + @staticmethod + def _fix_empty_tensors(input_tensor: Tensor) -> Tensor: + """Empty tensors can cause problems in DDP mode, this methods + corrects them.""" + if input_tensor.numel() == 0 and input_tensor.ndim == 1: + return input_tensor.unsqueeze(0) + return input_tensor + def compute_oks( pred: Tensor, @@ -174,7 +194,8 @@ def compute_oks( sigmas: Tensor, use_cocoeval_oks: bool, ) -> Tensor: - """Compute Object Keypoint Similarity between every GT and prediction. + """Compute Object Keypoint Similarity between every GT and + prediction. @type pred: Tensor[N, K, 3] @param pred: Predicted keypoints. @@ -183,11 +204,11 @@ def compute_oks( @type scales: Tensor[M] @param scales: Scales of the bounding boxes. @type sigmas: Tensor - @param sigmas: Sigma for each keypoint to weigh its importance, if C{None}, then use - same weights for all. + @param sigmas: Sigma for each keypoint to weigh its importance, if + C{None}, then use same weights for all. @type use_cocoeval_oks: bool - @param use_cocoeval_oks: Whether to use same OKS formula as in COCOeval or use the - one from definition. + @param use_cocoeval_oks: Whether to use same OKS formula as in + COCOeval or use the one from definition. @rtype: Tensor @return: Object Keypoint Similarity every pred and gt [M, N] """ @@ -211,73 +232,3 @@ def compute_oks( return (torch.exp(-oks) * kpt_mask[:, None]).sum(-1) / ( kpt_mask.sum(-1)[:, None] + eps ) - - -def fix_empty_tensors(input_tensor: Tensor) -> Tensor: - """Empty tensors can cause problems in DDP mode, this methods corrects them.""" - if input_tensor.numel() == 0 and input_tensor.ndim == 1: - return input_tensor.unsqueeze(0) - return input_tensor - - -def get_sigmas( - sigmas: list[float] | None, n_keypoints: int, class_name: str | None -) -> Tensor: - """Validate and set the sigma values.""" - if sigmas is not None: - if len(sigmas) == n_keypoints: - return torch.tensor(sigmas, dtype=torch.float32) - else: - error_msg = "The length of the sigmas list must be the same as the number of keypoints." - if class_name: - error_msg = f"[{class_name}] {error_msg}" - raise ValueError(error_msg) - else: - if n_keypoints == 17: - warn_msg = "Default COCO sigmas are being used." 
-            if class_name:
-                warn_msg = f"[{class_name}] {warn_msg}"
-            logger.warning(warn_msg)
-            return torch.tensor(
-                [
-                    0.026,
-                    0.025,
-                    0.025,
-                    0.035,
-                    0.035,
-                    0.079,
-                    0.079,
-                    0.072,
-                    0.072,
-                    0.062,
-                    0.062,
-                    0.107,
-                    0.107,
-                    0.087,
-                    0.087,
-                    0.089,
-                    0.089,
-                ],
-                dtype=torch.float32,
-            )
-        else:
-            warn_msg = "Default sigma of 0.04 is being used for each keypoint."
-            if class_name:
-                warn_msg = f"[{class_name}] {warn_msg}"
-            logger.warning(warn_msg)
-            return torch.tensor([0.04] * n_keypoints, dtype=torch.float32)
-
-
-def get_area_factor(area_factor: float | None, class_name: str | None) -> float:
-    """Set the default area factor if not defined."""
-    factor = 0.53
-    if area_factor is None:
-        warn_msg = (
-            f"Default area_factor of {factor} is being used for bbox area scaling."
-        )
-        if class_name:
-            warn_msg = f"[{class_name}] {warn_msg}"
-        logger.warning(warn_msg)
-        return factor
-    else:
-        return area_factor
diff --git a/luxonis_train/attached_modules/metrics/torchmetrics.py b/luxonis_train/attached_modules/metrics/torchmetrics.py
new file mode 100644
index 00000000..a8797a13
--- /dev/null
+++ b/luxonis_train/attached_modules/metrics/torchmetrics.py
@@ -0,0 +1,114 @@
+import logging
+from contextlib import suppress
+from typing import Any
+
+import torchmetrics
+from luxonis_ml.data import LabelType
+from torch import Tensor
+
+from .base_metric import BaseMetric
+
+logger = logging.getLogger(__name__)
+
+
+class TorchMetricWrapper(BaseMetric[Tensor]):
+    Metric: type[torchmetrics.Metric]
+
+    def __init__(self, **kwargs: Any):
+        super().__init__(node=kwargs.pop("node", None))
+        task = kwargs.get("task")
+        if task is None:
+            if "num_classes" in kwargs:
+                if kwargs["num_classes"] == 1:
+                    task = "binary"
+                else:
+                    task = "multiclass"
+            elif "num_labels" in kwargs:
+                task = "multilabel"
+            else:
+                with suppress(RuntimeError, ValueError):
+                    if self.n_classes == 1:
+                        task = "binary"
+                    else:
+                        task = "multiclass"
+
+        if task is None:
+            raise ValueError(
+                f"'{self.name}' does not have the 'task' parameter set "
+                "and it is not possible to infer it from the other arguments. "
+                "You can either set the 'task' parameter explicitly, provide either the 'num_classes' or 'num_labels' argument, "
+                "or use this metric with a node. "
+                "The 'task' can be one of 'binary', 'multiclass', or 'multilabel'."
+            )
+        self._task = task
+        kwargs["task"] = task
+
+        n_classes: int | None = kwargs.get(
+            "num_classes", kwargs.get("num_labels")
+        )
+
+        if n_classes is None:
+            with suppress(RuntimeError, ValueError):
+                n_classes = self.n_classes
+
+        if n_classes is None and task != "binary":
+            arg_name = "num_classes" if task == "multiclass" else "num_labels"
+            raise ValueError(
+                f"'{self.name}' metric does not have the '{arg_name}' parameter set "
+                "and it is not possible to infer it from the other arguments. "
+                f"You can either set the '{arg_name}' parameter explicitly, or use this metric with a node."
+            )
+
+        if task == "binary" and n_classes is not None and n_classes > 1:
+            raise ValueError(
+                f"Task type set to '{task}', but the dataset has more than 1 class. "
+                f"Set the `task` argument of '{self.name}' to either 'multiclass' or 'multilabel'."
+            )
+        elif task != "binary" and n_classes == 1:
+            raise ValueError(
+                f"Task type set to '{task}', but the dataset has only 1 class. "
+                f"Set the `task` argument of '{self.name}' to 'binary'."
+ ) + + if task == "multiclass": + kwargs["num_classes"] = n_classes + elif task == "multilabel": + kwargs["num_labels"] = n_classes + + self.metric = self.Metric(**kwargs) + + def update(self, preds: Tensor, target: Tensor) -> None: + if self._task in ["multiclass"]: + target = target.argmax(dim=1) + self.metric.update(preds, target) + + def compute(self) -> Tensor: + return self.metric.compute() + + def reset(self) -> None: + self.metric.reset() + + +class Accuracy(TorchMetricWrapper): + supported_labels = [LabelType.CLASSIFICATION, LabelType.SEGMENTATION] + Metric = torchmetrics.Accuracy + + +class F1Score(TorchMetricWrapper): + supported_labels = [LabelType.CLASSIFICATION, LabelType.SEGMENTATION] + Metric = torchmetrics.F1Score + + +class JaccardIndex(TorchMetricWrapper): + supported_labels = [LabelType.CLASSIFICATION, LabelType.SEGMENTATION] + Metric = torchmetrics.JaccardIndex + + +class Precision(TorchMetricWrapper): + supported_labels = [LabelType.CLASSIFICATION, LabelType.SEGMENTATION] + Metric = torchmetrics.Precision + + +class Recall(TorchMetricWrapper): + supported_labels = [LabelType.CLASSIFICATION, LabelType.SEGMENTATION] + Metric = torchmetrics.Recall diff --git a/luxonis_train/attached_modules/visualizers/base_visualizer.py b/luxonis_train/attached_modules/visualizers/base_visualizer.py index 5fa6db62..817a09d5 100644 --- a/luxonis_train/attached_modules/visualizers/base_visualizer.py +++ b/luxonis_train/attached_modules/visualizers/base_visualizer.py @@ -4,8 +4,8 @@ from typing_extensions import TypeVarTuple, Unpack from luxonis_train.attached_modules import BaseAttachedModule +from luxonis_train.utils import Labels, Packet from luxonis_train.utils.registry import VISUALIZERS -from luxonis_train.utils.types import Labels, Packet Ts = TypeVarTuple("Ts") @@ -17,8 +17,9 @@ class BaseVisualizer( ): """A base class for all visualizers. - This class defines the basic interface for all visualizers. It utilizes automatic - registration of defined subclasses to the L{VISUALIZERS} registry. + This class defines the basic interface for all visualizers. It + utilizes automatic registration of defined subclasses to the + L{VISUALIZERS} registry. """ @abstractmethod @@ -27,7 +28,12 @@ def forward( label_canvas: Tensor, prediction_canvas: Tensor, *args: Unpack[Ts], - ) -> Tensor | tuple[Tensor, Tensor] | tuple[Tensor, list[Tensor]] | list[Tensor]: + ) -> ( + Tensor + | tuple[Tensor, Tensor] + | tuple[Tensor, list[Tensor]] + | list[Tensor] + ): """Forward pass of the visualizer. 
Takes an image and the prepared inputs from the `prepare` method and @@ -62,4 +68,6 @@ def run( inputs: Packet[Tensor], labels: Labels, ) -> Tensor | tuple[Tensor, Tensor] | tuple[Tensor, list[Tensor]]: - return self(label_canvas, prediction_canvas, *self.prepare(inputs, labels)) + return self( + label_canvas, prediction_canvas, *self.prepare(inputs, labels) + ) diff --git a/luxonis_train/attached_modules/visualizers/bbox_visualizer.py b/luxonis_train/attached_modules/visualizers/bbox_visualizer.py index df3ac933..e544bf06 100644 --- a/luxonis_train/attached_modules/visualizers/bbox_visualizer.py +++ b/luxonis_train/attached_modules/visualizers/bbox_visualizer.py @@ -1,12 +1,16 @@ import logging import torch +from luxonis_ml.data import LabelType from torch import Tensor -from luxonis_train.utils.types import LabelType - from .base_visualizer import BaseVisualizer -from .utils import Color, draw_bounding_box_labels, draw_bounding_boxes, get_color +from .utils import ( + Color, + draw_bounding_box_labels, + draw_bounding_boxes, + get_color, +) class BBoxVisualizer(BaseVisualizer[list[Tensor], Tensor]): @@ -25,39 +29,50 @@ def __init__( ): """Visualizer for bounding box predictions. - Creates a visualization of the bounding box predictions and labels. + Creates a visualization of the bounding box predictions and + labels. @type labels: dict[int, str] | list[str] | None - @param labels: Either a dictionary mapping class indices to names, or a list of - names. If list is provided, the label mapping is done by index. By default, - no labels are drawn. + @param labels: Either a dictionary mapping class indices to + names, or a list of names. If list is provided, the label + mapping is done by index. By default, no labels are drawn. @type draw_labels: bool - @param draw_labels: Whether or not to draw labels. Defaults to C{True}. + @param draw_labels: Whether or not to draw labels. Defaults to + C{True}. @type colors: dict[int, Color] | list[Color] | None - @param colors: Either a dictionary mapping class indices to colors, or a list of - colors. If list is provided, the color mapping is done by index. By default, - random colors are used. + @param colors: Either a dictionary mapping class indices to + colors, or a list of colors. If list is provided, the color + mapping is done by index. By default, random colors are + used. @type fill: bool - @param fill: Whether or not to fill the bounding boxes. Defaults to C{False}. + @param fill: Whether or not to fill the bounding boxes. Defaults + to C{False}. @type width: int | None - @param width: The width of the bounding box lines. Defaults to C{1}. + @param width: The width of the bounding box lines. Defaults to + C{1}. @type font: str | None - @param font: A filename containing a TrueType font. Defaults to C{None}. + @param font: A filename containing a TrueType font. Defaults to + C{None}. @type font_size: int | None - @param font_size: The font size to use for the labels. Defaults to C{None}. + @param font_size: The font size to use for the labels. Defaults + to C{None}. 
""" super().__init__(**kwargs) if isinstance(labels, list): labels = {i: label for i, label in enumerate(labels)} self.bbox_labels = labels or { - i: label for i, label in enumerate(self.node.class_names) + i: label for i, label in enumerate(self.class_names) } if colors is None: - colors = {label: get_color(i) for i, label in self.bbox_labels.items()} + colors = { + label: get_color(i) for i, label in self.bbox_labels.items() + } if isinstance(colors, list): - colors = {self.bbox_labels[i]: color for i, color in enumerate(colors)} + colors = { + self.bbox_labels[i]: color for i, color in enumerate(colors) + } self.colors = colors self.fill = fill self.width = width @@ -159,16 +174,17 @@ def forward( predictions: list[Tensor], targets: Tensor, ) -> tuple[Tensor, Tensor]: - """Creates a visualization of the bounding box predictions and labels. + """Creates a visualization of the bounding box predictions and + labels. @type label_canvas: Tensor @param label_canvas: The canvas containing the labels. @type prediction_canvas: Tensor @param prediction_canvas: The canvas containing the predictions. @type prediction: Tensor - @param prediction: The predicted bounding boxes. The shape should be [N, 6], - where N is the number of bounding boxes and the last dimension is [x1, y1, - x2, y2, class, conf]. + @param prediction: The predicted bounding boxes. The shape + should be [N, 6], where N is the number of bounding boxes + and the last dimension is [x1, y1, x2, y2, class, conf]. @type targets: Tensor @param targets: The target bounding boxes. """ diff --git a/luxonis_train/attached_modules/visualizers/classification_visualizer.py b/luxonis_train/attached_modules/visualizers/classification_visualizer.py index 20a5710e..9d26172b 100644 --- a/luxonis_train/attached_modules/visualizers/classification_visualizer.py +++ b/luxonis_train/attached_modules/visualizers/classification_visualizer.py @@ -23,8 +23,8 @@ def __init__( """Visualizer for classification tasks. @type include_plot: bool - @param include_plot: Whether to include a plot of the class probabilities in the - visualization. Defaults to C{True}. + @param include_plot: Whether to include a plot of the class + probabilities in the visualization. Defaults to C{True}. 
""" super().__init__(**kwargs) self.include_plot = include_plot @@ -34,19 +34,21 @@ def __init__( def _get_class_name(self, pred: Tensor) -> str: idx = int((pred.argmax()).item()) - if self.node.class_names is None: + if self.class_names is None: return str(idx) - return self.node.class_names[idx] + return self.class_names[idx] - def _generate_plot(self, prediction: Tensor, width: int, height: int) -> Tensor: - prediction = prediction.softmax(-1).detach().cpu().numpy() + def _generate_plot( + self, prediction: Tensor, width: int, height: int + ) -> Tensor: + pred = prediction.softmax(-1).detach().cpu().numpy() fig, ax = plt.subplots(figsize=(width / 100, height / 100)) - ax.bar(np.arange(len(prediction)), prediction) - ax.set_xticks(np.arange(len(prediction))) - if self.node.class_names is not None: - ax.set_xticklabels(self.node.class_names, rotation=90) + ax.bar(np.arange(len(pred)), pred) + ax.set_xticks(np.arange(len(pred))) + if self.class_names is not None: + ax.set_xticklabels(self.class_names, rotation=90) else: - ax.set_xticklabels(np.arange(1, len(prediction) + 1)) + ax.set_xticklabels(np.arange(1, len(pred) + 1)) ax.set_ylim(0, 1) ax.set_xlabel("Class") ax.set_ylabel("Probability") @@ -88,7 +90,9 @@ def forward( overlay[i] = numpy_to_torch_img(arr) if self.include_plot: plots[i] = self._generate_plot( - prediction, prediction_canvas.shape[3], prediction_canvas.shape[2] + prediction, + prediction_canvas.shape[3], + prediction_canvas.shape[2], ) if self.include_plot: diff --git a/luxonis_train/attached_modules/visualizers/keypoint_visualizer.py b/luxonis_train/attached_modules/visualizers/keypoint_visualizer.py index 287d5e1c..53b9cb88 100644 --- a/luxonis_train/attached_modules/visualizers/keypoint_visualizer.py +++ b/luxonis_train/attached_modules/visualizers/keypoint_visualizer.py @@ -22,18 +22,20 @@ def __init__( """Visualizer for keypoints. @type visibility_threshold: float - @param visibility_threshold: Threshold for visibility of keypoints. If the - visibility of a keypoint is below this threshold, it is considered as not - visible. Defaults to C{0.5}. + @param visibility_threshold: Threshold for visibility of + keypoints. If the visibility of a keypoint is below this + threshold, it is considered as not visible. Defaults to + C{0.5}. @type connectivity: list[tuple[int, int]] | None - @param connectivity: List of tuples of keypoint indices that define the - connections in the skeleton. Defaults to C{None}. + @param connectivity: List of tuples of keypoint indices that + define the connections in the skeleton. Defaults to C{None}. @type visible_color: L{Color} - @param visible_color: Color of visible keypoints. Either a string or a tuple of - RGB values. Defaults to C{"red"}. + @param visible_color: Color of visible keypoints. Either a + string or a tuple of RGB values. Defaults to C{"red"}. @type nonvisible_color: L{Color} | None - @param nonvisible_color: Color of nonvisible keypoints. If C{None}, nonvisible - keypoints are not drawn. Defaults to C{None}. + @param nonvisible_color: Color of nonvisible keypoints. If + C{None}, nonvisible keypoints are not drawn. Defaults to + C{None}. 
""" super().__init__(**kwargs) self.visibility_threshold = visibility_threshold @@ -62,7 +64,9 @@ def draw_predictions( if nonvisible_color is not None: _kwargs = deepcopy(kwargs) _kwargs["colors"] = nonvisible_color - nonvisible_kpts = prediction[..., :2] * mask.unsqueeze(-1).float() + nonvisible_kpts = ( + prediction[..., :2] * mask.unsqueeze(-1).float() + ) viz[i] = draw_keypoints( viz[i].clone(), nonvisible_kpts[..., :2], diff --git a/luxonis_train/attached_modules/visualizers/multi_visualizer.py b/luxonis_train/attached_modules/visualizers/multi_visualizer.py index c7925ecc..b7ecbfbb 100644 --- a/luxonis_train/attached_modules/visualizers/multi_visualizer.py +++ b/luxonis_train/attached_modules/visualizers/multi_visualizer.py @@ -7,7 +7,8 @@ class MultiVisualizer(BaseVisualizer[Packet[Tensor], Labels]): - """Special type of visualizer that combines multiple visualizers together. + """Special type of visualizer that combines multiple visualizers + together. All the visualizers are applied in the order they are provided and they all draw on the same canvas. @@ -25,14 +26,16 @@ def __init__(self, visualizers: list[Kwargs], **kwargs): self.visualizers = [] for item in visualizers: visualizer_params = item.get("params", {}) - visualizer = VISUALIZERS.get(item["name"])(**visualizer_params, **kwargs) + visualizer = VISUALIZERS.get(item["name"])( + **visualizer_params, **kwargs + ) self.visualizers.append(visualizer) def prepare( - self, output: Packet[Tensor], label: Labels, idx: int = 0 + self, inputs: Packet[Tensor], label: Labels, idx: int = 0 ) -> tuple[Packet[Tensor], Labels]: self._idx = idx - return output, label + return inputs, label def forward( self, @@ -42,12 +45,16 @@ def forward( labels: Labels, ) -> tuple[Tensor, Tensor]: for visualizer in self.visualizers: - match visualizer.run(label_canvas, prediction_canvas, outputs, labels): + match visualizer.run( + label_canvas, prediction_canvas, outputs, labels + ): case Tensor() as prediction_viz: prediction_canvas = prediction_viz case (Tensor(data=label_viz), Tensor(data=prediction_viz)): label_canvas = label_viz prediction_canvas = prediction_viz case _: - raise NotImplementedError + raise NotImplementedError( + "Unexpected return type from visualizer." + ) return label_canvas, prediction_canvas diff --git a/luxonis_train/attached_modules/visualizers/segmentation_visualizer.py b/luxonis_train/attached_modules/visualizers/segmentation_visualizer.py index 85b93ce1..15e2fd09 100644 --- a/luxonis_train/attached_modules/visualizers/segmentation_visualizer.py +++ b/luxonis_train/attached_modules/visualizers/segmentation_visualizer.py @@ -1,12 +1,16 @@ import logging import torch +from luxonis_ml.data import LabelType from torch import Tensor -from luxonis_train.utils.types import LabelType - from .base_visualizer import BaseVisualizer -from .utils import Color, draw_segmentation_labels, get_color, seg_output_to_bool +from .utils import ( + Color, + draw_segmentation_labels, + get_color, + seg_output_to_bool, +) logger = logging.getLogger(__name__) log_disable = False @@ -98,7 +102,8 @@ def forward( targets: Tensor, **kwargs, ) -> tuple[Tensor, Tensor]: - """Creates a visualization of the segmentation predictions and labels. + """Creates a visualization of the segmentation predictions and + labels. @type label_canvas: Tensor @param label_canvas: The canvas to draw the labels on. @@ -146,7 +151,9 @@ def _adjust_colors( if not log_disable: if colors is None: - logger.warning("No colors provided. 
Using random colors instead.") + logger.warning( + "No colors provided. Using random colors instead." + ) elif data.size(1) != len(colors): logger.warning( f"Number of colors ({len(colors)}) does not match number of " diff --git a/luxonis_train/attached_modules/visualizers/utils.py b/luxonis_train/attached_modules/visualizers/utils.py index c55b12ce..402ab98f 100644 --- a/luxonis_train/attached_modules/visualizers/utils.py +++ b/luxonis_train/attached_modules/visualizers/utils.py @@ -19,7 +19,7 @@ draw_segmentation_masks, ) -from luxonis_train.utils.config import Config +from luxonis_train.utils import Config Color = str | tuple[int, int, int] """Color type alias. @@ -44,13 +44,14 @@ def figure_to_torch(fig: Figure, width: int, height: int) -> Tensor: def torch_img_to_numpy( img: Tensor, reverse_colors: bool = False ) -> npt.NDArray[np.uint8]: - """Converts a torch image (CHW) to a numpy array (HWC). Optionally also converts - colors. + """Converts a torch image (CHW) to a numpy array (HWC). Optionally + also converts colors. @type img: Tensor @param img: Torch image (CHW) @type reverse_colors: bool - @param reverse_colors: Whether to reverse colors (RGB to BGR). Defaults to False. + @param reverse_colors: Whether to reverse colors (RGB to BGR). + Defaults to False. @rtype: npt.NDArray[np.uint8] @return: Numpy image (HWC) """ @@ -129,8 +130,8 @@ def draw_bounding_box_labels(img: Tensor, label: Tensor, **kwargs) -> Tensor: @type img: Tensor @param img: Image to draw on. @type label: Tensor - @param label: Bounding box label. The shape should be (n_instances, 4), where the - last dimension is (x, y, w, h). + @param label: Bounding box label. The shape should be (n_instances, + 4), where the last dimension is (x, y, w, h). @type kwargs: dict @param kwargs: Additional arguments to pass to L{torchvision.utils.draw_bounding_boxes}. @@ -150,10 +151,11 @@ def draw_keypoint_labels(img: Tensor, label: Tensor, **kwargs) -> Tensor: @type img: Tensor @param img: Image to draw on. @type label: Tensor - @param label: Keypoint label. The shape should be (n_instances, 3), where the last - dimension is (x, y, visibility). + @param label: Keypoint label. The shape should be (n_instances, 3), + where the last dimension is (x, y, visibility). @type kwargs: dict - @param kwargs: Additional arguments to pass to L{torchvision.utils.draw_keypoints}. + @param kwargs: Additional arguments to pass to + L{torchvision.utils.draw_keypoints}. @rtype: Tensor @return: Image with keypoint labels drawn on. """ @@ -191,7 +193,8 @@ def unnormalize( std: list[float] | float | None = None, to_uint8: bool = False, ) -> Tensor: - """Unnormalizes an image back to original values, optionally converts it to uint8. + """Unnormalizes an image back to original values, optionally + converts it to uint8. @type img: Tensor @param img: Image to unnormalize. 
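The unnormalize helper documented above inverts the dataset normalization channel-wise before drawing. A minimal sketch of the idea, assuming ImageNet-style statistics; the function name and defaults are illustrative, not the module's exact API:

import torch

def unnormalize_sketch(
    img: torch.Tensor,  # (C, H, W) image normalized as (x - mean) / std
    mean: tuple[float, float, float] = (0.485, 0.456, 0.406),
    std: tuple[float, float, float] = (0.229, 0.224, 0.225),
    to_uint8: bool = False,
) -> torch.Tensor:
    # Broadcast the per-channel statistics over H and W and invert
    # the normalization.
    mean_t = torch.tensor(mean).view(-1, 1, 1)
    std_t = torch.tensor(std).view(-1, 1, 1)
    out = img * std_t + mean_t
    if to_uint8:
        # Clamp to the valid range before converting to 8-bit.
        out = (out.clamp(0.0, 1.0) * 255).to(torch.uint8)
    return out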
@@ -304,9 +307,12 @@ def get_color(seed: int) -> Color: # # TEST: def combine_visualizations( - visualization: Tensor | tuple[Tensor, Tensor] | tuple[Tensor, list[Tensor]], + visualization: Tensor + | tuple[Tensor, Tensor] + | tuple[Tensor, list[Tensor]], ) -> Tensor: - """Default way of combining multiple visualizations into one final image.""" + """Default way of combining multiple visualizations into one final + image.""" def resize_to_match( fst: Tensor, @@ -315,7 +321,7 @@ def resize_to_match( keep_size: Literal["larger", "smaller", "first", "second"] = "larger", resize_along: Literal["width", "height", "exact"] = "height", keep_aspect_ratio: bool = True, - ): + ) -> tuple[Tensor, Tensor]: """Resizes two images so they have the same size. Resizes two images so they can be concateneted together. It's possible to @@ -411,7 +417,9 @@ def resize_to_match( case Tensor() as viz: return viz case (Tensor(data=viz_labels), Tensor(data=viz_predictions)): - viz_labels, viz_predictions = resize_to_match(viz_labels, viz_predictions) + viz_labels, viz_predictions = resize_to_match( + viz_labels, viz_predictions + ) return torch.cat([viz_labels, viz_predictions], dim=-1) case (Tensor(data=_), [*viz]) if isinstance(viz, list) and all( diff --git a/luxonis_train/callbacks/__init__.py b/luxonis_train/callbacks/__init__.py index 4c7f7824..95f860a1 100644 --- a/luxonis_train/callbacks/__init__.py +++ b/luxonis_train/callbacks/__init__.py @@ -1,9 +1,13 @@ from lightning.pytorch.callbacks import ( DeviceStatsMonitor, EarlyStopping, + GradientAccumulationScheduler, LearningRateMonitor, ModelCheckpoint, + ModelPruning, RichModelSummary, + StochasticWeightAveraging, + Timer, ) from luxonis_train.utils.registry import CALLBACKS @@ -26,6 +30,10 @@ CALLBACKS.register_module(module=ModelCheckpoint) CALLBACKS.register_module(module=RichModelSummary) CALLBACKS.register_module(module=DeviceStatsMonitor) +CALLBACKS.register_module(module=GradientAccumulationScheduler) +CALLBACKS.register_module(module=StochasticWeightAveraging) +CALLBACKS.register_module(module=Timer) +CALLBACKS.register_module(module=ModelPruning) __all__ = [ diff --git a/luxonis_train/callbacks/archive_on_train_end.py b/luxonis_train/callbacks/archive_on_train_end.py index d9e7b298..30949e4e 100644 --- a/luxonis_train/callbacks/archive_on_train_end.py +++ b/luxonis_train/callbacks/archive_on_train_end.py @@ -26,12 +26,12 @@ def on_train_end( """ path = self.get_checkpoint(pl_module) - if path is None: + if path is None: # pragma: no cover logger.warning("Skipping model archiving.") return onnx_path = pl_module.core._exported_models.get("onnx") - if onnx_path is None: + if onnx_path is None: # pragma: no cover logger.error( "Model executable not found. " "Make sure to run exporter callback before archiver callback. " diff --git a/luxonis_train/callbacks/export_on_train_end.py b/luxonis_train/callbacks/export_on_train_end.py index 261c4ef6..e727e81f 100644 --- a/luxonis_train/callbacks/export_on_train_end.py +++ b/luxonis_train/callbacks/export_on_train_end.py @@ -25,7 +25,7 @@ def on_train_end( @param pl_module: Pytorch Lightning module. 
""" path = self.get_checkpoint(pl_module) - if path is None: + if path is None: # pragma: no cover logger.warning("Skipping model export.") return diff --git a/luxonis_train/callbacks/gpu_stats_monitor.py b/luxonis_train/callbacks/gpu_stats_monitor.py index 9479d4d2..a189ed3f 100644 --- a/luxonis_train/callbacks/gpu_stats_monitor.py +++ b/luxonis_train/callbacks/gpu_stats_monitor.py @@ -27,11 +27,11 @@ import pytorch_lightning as pl import torch -from lightning.pytorch.accelerators import CUDAAccelerator # type: ignore -from pytorch_lightning.utilities import rank_zero_only -from pytorch_lightning.utilities.exceptions import ( - MisconfigurationException, # type: ignore +from lightning.pytorch.accelerators.cuda import CUDAAccelerator +from lightning_fabric.utilities.exceptions import ( + MisconfigurationException, # noqa: F401 ) +from pytorch_lightning.utilities import rank_zero_only from pytorch_lightning.utilities.parsing import AttributeDict from pytorch_lightning.utilities.types import STEP_OUTPUT @@ -40,49 +40,6 @@ @CALLBACKS.register_module() class GPUStatsMonitor(pl.Callback): - """Automatically monitors and logs GPU stats during training stage. - C{GPUStatsMonitor} is a callback and in order to use it you need to assign a logger - in the C{Trainer}. - - Args: - memory_utilization: Set to C{True} to monitor used, free and percentage of memory - utilization at the start and end of each step. Default: C{True}. - gpu_utilization: Set to C{True} to monitor percentage of GPU utilization - at the start and end of each step. Default: C{True}. - intra_step_time: Set to C{True} to monitor the time of each step. Default: {False}. - inter_step_time: Set to C{True} to monitor the time between the end of one step - and the start of the next step. Default: C{False}. - fan_speed: Set to C{True} to monitor percentage of fan speed. Default: C{False}. - temperature: Set to C{True} to monitor the memory and gpu temperature in degree Celsius. - Default: C{False}. - - Raises: - MisconfigurationException: - If NVIDIA driver is not installed, not running on GPUs, or C{Trainer} has no logger. - - Example:: - - >>> from pytorch_lightning import Trainer - >>> from pytorch_lightning.callbacks import GPUStatsMonitor - >>> gpu_stats = GPUStatsMonitor() # doctest: +SKIP - >>> trainer = Trainer(callbacks=[gpu_stats]) # doctest: +SKIP - - GPU stats are mainly based on C{nvidia-smi --query-gpu} command. The description of the queries is as follows: - - - **fan.speed** – The fan speed value is the percent of maximum speed that the device's fan is currently - intended to run at. It ranges from 0 to 100 %. Note: The reported speed is the intended fan speed. - If the fan is physically blocked and unable to spin, this output will not match the actual fan speed. - Many parts do not report fan speeds because they rely on cooling via fans in the surrounding enclosure. - - **memory.used** – Total memory allocated by active contexts. - - **memory.free** – Total free memory. - - **utilization.gpu** – Percent of time over the past sample period during which one or more kernels was - executing on the GPU. The sample period may be between 1 second and 1/6 second depending on the product. - - **utilization.memory** – Percent of time over the past sample period during which global (device) memory was - being read or written. The sample period may be between 1 second and 1/6 second depending on the product. - - **temperature.gpu** – Core GPU temperature, in degrees C. 
-    - **temperature.memory** – HBM memory temperature, in degrees C.
-    """
-
     def __init__(
         self,
         memory_utilization: bool = True,
@@ -92,6 +49,40 @@ def __init__(
         fan_speed: bool = False,
         temperature: bool = False,
     ):
+        """Automatically monitors and logs GPU stats during the
+        training stage. C{GPUStatsMonitor} is a callback and in order
+        to use it you need to assign a logger in the C{Trainer}.
+
+        GPU stats are mainly based on the C{nvidia-smi --query-gpu} command. The description of the queries is as follows:
+
+        - C{fan.speed} – The fan speed value is the percent of maximum speed that the device's fan is currently
+        intended to run at. It ranges from 0 to 100 %. Note: The reported speed is the intended fan speed.
+        If the fan is physically blocked and unable to spin, this output will not match the actual fan speed.
+        Many parts do not report fan speeds because they rely on cooling via fans in the surrounding enclosure.
+        - C{memory.used} – Total memory allocated by active contexts.
+        - C{memory.free} – Total free memory.
+        - C{utilization.gpu} – Percent of time over the past sample period during which one or more kernels was
+        executing on the GPU. The sample period may be between 1 second and 1/6 second depending on the product.
+        - C{utilization.memory} – Percent of time over the past sample period during which global (device) memory was
+        being read or written. The sample period may be between 1 second and 1/6 second depending on the product.
+        - C{temperature.gpu} – Core GPU temperature, in degrees C.
+        - C{temperature.memory} – HBM memory temperature, in degrees C.
+
+        @type memory_utilization: bool
+        @param memory_utilization: Set to C{True} to monitor used, free and percentage of memory utilization at the start and end of each step. Defaults to C{True}.
+        @type gpu_utilization: bool
+        @param gpu_utilization: Set to C{True} to monitor percentage of GPU utilization at the start and end of each step. Defaults to C{True}.
+        @type intra_step_time: bool
+        @param intra_step_time: Set to C{True} to monitor the time of each step. Defaults to C{False}.
+        @type inter_step_time: bool
+        @param inter_step_time: Set to C{True} to monitor the time between the end of one step and the start of the next step. Defaults to C{False}.
+        @type fan_speed: bool
+        @param fan_speed: Set to C{True} to monitor percentage of fan speed. Defaults to C{False}.
+        @type temperature: bool
+        @param temperature: Set to C{True} to monitor the memory and GPU temperature in degrees Celsius. Defaults to C{False}.
+        @raises MisconfigurationException: If NVIDIA driver is not installed, not running on GPUs, or C{Trainer} has no logger.
+ """ + super().__init__() if shutil.which("nvidia-smi") is None: @@ -167,7 +158,9 @@ def on_train_batch_start( gpu_stat_keys = self._get_gpu_stat_keys() gpu_stats = self._get_gpu_stats([k for k, _ in gpu_stat_keys]) - logs = self._parse_gpu_stats(self._device_ids, gpu_stats, gpu_stat_keys) + logs = self._parse_gpu_stats( + self._device_ids, gpu_stats, gpu_stat_keys + ) if self._log_stats.inter_step_time and self._snap_inter_step_time: # First log at beginning of second step @@ -193,9 +186,13 @@ def on_train_batch_end( if not trainer._logger_connector.should_update_logs: return - gpu_stat_keys = self._get_gpu_stat_keys() + self._get_gpu_device_stat_keys() + gpu_stat_keys = ( + self._get_gpu_stat_keys() + self._get_gpu_device_stat_keys() + ) gpu_stats = self._get_gpu_stats([k for k, _ in gpu_stat_keys]) - logs = self._parse_gpu_stats(self._device_ids, gpu_stats, gpu_stat_keys) + logs = self._parse_gpu_stats( + self._device_ids, gpu_stats, gpu_stat_keys + ) if self._log_stats.intra_step_time and self._snap_intra_step_time: logs["batch_time/intra_step (ms)"] = ( @@ -213,7 +210,9 @@ def _get_gpu_ids(device_ids: List[int]) -> List[str]: cuda_visible_devices: List[str] = os.getenv( "CUDA_VISIBLE_DEVICES", default=default ).split(",") - return [cuda_visible_devices[device_id].strip() for device_id in device_ids] + return [ + cuda_visible_devices[device_id].strip() for device_id in device_ids + ] def _get_gpu_stats(self, queries: List[str]) -> List[List[float]]: if not queries: @@ -251,7 +250,9 @@ def _to_float(x: str) -> float: @staticmethod def _parse_gpu_stats( - device_ids: List[int], stats: List[List[float]], keys: List[Tuple[str, str]] + device_ids: List[int], + stats: List[List[float]], + keys: List[Tuple[str, str]], ) -> Dict[str, float]: """Parse the gpu stats into a loggable dict.""" logs = {} @@ -288,6 +289,8 @@ def _get_gpu_device_stat_keys(self) -> List[Tuple[str, str]]: stat_keys.append(("fan.speed", "%")) if self._log_stats.temperature: - stat_keys.extend([("temperature.gpu", "°C"), ("temperature.memory", "°C")]) + stat_keys.extend( + [("temperature.gpu", "°C"), ("temperature.memory", "°C")] + ) return stat_keys diff --git a/luxonis_train/callbacks/luxonis_progress_bar.py b/luxonis_train/callbacks/luxonis_progress_bar.py index d14fcf08..b8bf6512 100644 --- a/luxonis_train/callbacks/luxonis_progress_bar.py +++ b/luxonis_train/callbacks/luxonis_progress_bar.py @@ -3,7 +3,11 @@ import lightning.pytorch as pl import tabulate -from lightning.pytorch.callbacks import ProgressBar, RichProgressBar, TQDMProgressBar +from lightning.pytorch.callbacks import ( + ProgressBar, + RichProgressBar, + TQDMProgressBar, +) from rich.console import Console from rich.table import Table @@ -14,7 +18,6 @@ class BaseLuxonisProgressBar(ABC, ProgressBar): def get_metrics( self, trainer: pl.Trainer, pl_module: pl.LightningModule ) -> dict[str, int | str | float | dict[str, float]]: - # NOTE: there might be a cleaner way of doing this items = super().get_metrics(trainer, pl_module) items.pop("v_num", None) if trainer.training and pl_module.training_step_outputs: @@ -30,7 +33,8 @@ def print_results( ) -> None: """Prints results to the console. - This includes the stage name, loss value, and tables with metrics. + This includes the stage name, loss value, and tables with + metrics. @type stage: str @param stage: Stage name. @@ -39,12 +43,13 @@ def print_results( @type metrics: Mapping[str, Mapping[str, int | str | float]] @param metrics: Metrics in format {table_name: table}. """ - pass + ... 
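The two bars below provide the concrete print_results implementations. For orientation, a minimal plain-print subclass might look like the sketch below; it assumes print_results is the only abstract member and stubs out Lightning's enable/disable hooks, so it is an illustration rather than the package's API:

from collections.abc import Mapping

import tabulate

class PlainProgressBar(BaseLuxonisProgressBar):
    # Hypothetical subclass for illustration; BaseLuxonisProgressBar
    # is the abstract base defined above.

    def enable(self) -> None:  # required by Lightning's ProgressBar
        pass

    def disable(self) -> None:
        pass

    def print_results(
        self,
        stage: str,
        loss: float,
        metrics: Mapping[str, Mapping[str, int | str | float]],
    ) -> None:
        print(f"----- {stage} -----")
        print(f"Loss: {loss}")
        for table_name, table in metrics.items():
            # Render each metric table as two columns: name and value.
            print(table_name)
            print(tabulate.tabulate(table.items(), headers=["Name", "Value"]))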
@CALLBACKS.register_module() class LuxonisTQDMProgressBar(TQDMProgressBar, BaseLuxonisProgressBar): - """Custom text progress bar based on TQDMProgressBar from Pytorch Lightning.""" + """Custom text progress bar based on TQDMProgressBar from Pytorch + Lightning.""" def __init__(self): super().__init__(leave=True) @@ -71,7 +76,8 @@ def _print_table( @type key_name: str @param key_name: Name of the key column. Defaults to C{"Name"}. @type value_name: str - @param value_name: Name of the value column. Defaults to C{"Value"}. + @param value_name: Name of the value column. Defaults to + C{"Value"}. """ self._rule(title) print( @@ -100,14 +106,15 @@ def print_results( @CALLBACKS.register_module() class LuxonisRichProgressBar(RichProgressBar, BaseLuxonisProgressBar): - """Custom rich text progress bar based on RichProgressBar from Pytorch Lightning.""" + """Custom rich text progress bar based on RichProgressBar from + Pytorch Lightning.""" def __init__(self): super().__init__(leave=True) @property def console(self) -> Console: - if self._console is None: + if self._console is None: # pragma: no cover raise RuntimeError( "Console is not initialized for the `LuxonisRichProgressBar`. " "Consider setting `tracker.use_rich_progress_bar` to `False` in the configuration." @@ -130,7 +137,8 @@ def print_table( @type key_name: str @param key_name: Name of the key column. Defaults to C{"Name"}. @type value_name: str - @param value_name: Name of the value column. Defaults to C{"Value"}. + @param value_name: Name of the value column. Defaults to + C{"Value"}. """ rich_table = Table( title=title, @@ -140,10 +148,7 @@ def print_table( rich_table.add_column(key_name, style="magenta") rich_table.add_column(value_name, style="white") for name, value in table.items(): - if isinstance(value, float): - rich_table.add_row(name, f"{value:.5f}") - else: - rich_table.add_row(name, str(value)) + rich_table.add_row(name, f"{value:.5f}") self.console.print(rich_table) def print_results( @@ -153,7 +158,9 @@ def print_results( metrics: Mapping[str, Mapping[str, int | str | float]], ) -> None: self.console.rule(f"{stage}", style="bold magenta") - self.console.print(f"[bold magenta]Loss:[/bold magenta] [white]{loss}[/white]") + self.console.print( + f"[bold magenta]Loss:[/bold magenta] [white]{loss}[/white]" + ) self.console.print("[bold magenta]Metrics:[/bold magenta]") for table_name, table in metrics.items(): self.print_table(table_name, table) diff --git a/luxonis_train/callbacks/metadata_logger.py b/luxonis_train/callbacks/metadata_logger.py index 45ff8717..ab29f7d0 100644 --- a/luxonis_train/callbacks/metadata_logger.py +++ b/luxonis_train/callbacks/metadata_logger.py @@ -6,7 +6,7 @@ import yaml import luxonis_train -from luxonis_train.utils.config import Config +from luxonis_train.utils import Config from luxonis_train.utils.registry import CALLBACKS @@ -15,8 +15,9 @@ class MetadataLogger(pl.Callback): def __init__(self, hyperparams: list[str]): """Callback that logs training metadata. - Metadata include all defined hyperparameters together with git hashes of - luxonis-ml and luxonis-train packages. Also stores this information locally. + Metadata include all defined hyperparameters together with git + hashes of luxonis-ml and luxonis-train packages. Also stores + this information locally. @type hyperparams: list[str] @param hyperparams: List of hyperparameters to log. 
@@ -25,30 +26,44 @@ def __init__(self, hyperparams: list[str]): self.hyperparams = hyperparams def on_fit_start( - self, _: pl.Trainer, pl_module: "luxonis_train.models.LuxonisLightningModule" + self, + _: pl.Trainer, + pl_module: "luxonis_train.models.LuxonisLightningModule", ) -> None: cfg: Config = pl_module.cfg hparams = {key: cfg.get(key) for key in self.hyperparams} - # try to get luxonis-ml and luxonis-train git commit hashes (if installed as editable) luxonis_ml_hash = self._get_editable_package_git_hash("luxonis_ml") - if luxonis_ml_hash: + if luxonis_ml_hash: # pragma: no cover hparams["luxonis_ml"] = luxonis_ml_hash - luxonis_train_hash = self._get_editable_package_git_hash("luxonis_train") - if luxonis_train_hash: + luxonis_train_hash = self._get_editable_package_git_hash( + "luxonis_train" + ) + if luxonis_train_hash: # pragma: no cover hparams["luxonis_train"] = luxonis_train_hash pl_module.logger.log_hyperparams(hparams) - # also save metadata locally - with open(osp.join(pl_module.save_dir, "metadata.yaml"), "w+") as f: + with open(osp.join(pl_module.save_dir, "metadata.yaml"), "w") as f: yaml.dump(hparams, f, default_flow_style=False) @staticmethod - def _get_editable_package_git_hash(package_name: str) -> str | None: + def _get_editable_package_git_hash( + package_name: str, + ) -> str | None: # pragma: no cover + """Get git hash of an editable package. + + @type package_name: str + @param package_name: Name of the package. + @rtype: str or None + @return: Git hash of the package or None if the package is not + installed in editable mode. + """ try: distribution = pkg_resources.get_distribution(package_name) + if distribution.location is None: + return None package_location = osp.join(distribution.location, package_name) # remove any additional folders in path (e.g. "/src") diff --git a/luxonis_train/callbacks/module_freezer.py b/luxonis_train/callbacks/module_freezer.py index 4f73ff30..de0afa99 100644 --- a/luxonis_train/callbacks/module_freezer.py +++ b/luxonis_train/callbacks/module_freezer.py @@ -13,7 +13,8 @@ def __init__(self, frozen_modules: list[tuple[nn.Module, int]]): """Callback that freezes parts of the model. @type frozen_modules: list[tuple[nn.Module, int]] - @param frozen_modules: List of tuples of modules and epochs to freeze until. + @param frozen_modules: List of tuples of modules and epochs to + freeze until. 
""" super().__init__() self.frozen_modules = frozen_modules diff --git a/luxonis_train/callbacks/needs_checkpoint.py b/luxonis_train/callbacks/needs_checkpoint.py index 30355e82..b3de6aed 100644 --- a/luxonis_train/callbacks/needs_checkpoint.py +++ b/luxonis_train/callbacks/needs_checkpoint.py @@ -10,7 +10,9 @@ class NeedsCheckpoint(pl.Callback): def __init__( - self, preferred_checkpoint: Literal["metric", "loss"] = "metric", **kwargs + self, + preferred_checkpoint: Literal["metric", "loss"] = "metric", + **kwargs, ): super().__init__(**kwargs) self.preferred_checkpoint = preferred_checkpoint @@ -40,7 +42,8 @@ def _get_checkpoint( ) return path - def _get_other_type(self, checkpoint_type: str) -> str: + @staticmethod + def _get_other_type(checkpoint_type: str) -> str: if checkpoint_type == "loss": return "metric" return "loss" diff --git a/luxonis_train/callbacks/test_on_train_end.py b/luxonis_train/callbacks/test_on_train_end.py index f2bb09ec..a60a16dd 100644 --- a/luxonis_train/callbacks/test_on_train_end.py +++ b/luxonis_train/callbacks/test_on_train_end.py @@ -27,4 +27,6 @@ def on_train_end( for callback in trainer.callbacks: # type: ignore if isinstance(callback, ModelCheckpoint): if hash(callback.monitor) in best_paths: - callback.best_model_path = best_paths[hash(callback.monitor)] + callback.best_model_path = best_paths[ + hash(callback.monitor) + ] diff --git a/luxonis_train/callbacks/upload_checkpoint.py b/luxonis_train/callbacks/upload_checkpoint.py index 29da59ef..b9753e94 100644 --- a/luxonis_train/callbacks/upload_checkpoint.py +++ b/luxonis_train/callbacks/upload_checkpoint.py @@ -12,7 +12,8 @@ @CALLBACKS.register_module() class UploadCheckpoint(pl.Callback): - """Callback that uploads best checkpoint based on the validation loss.""" + """Callback that uploads best checkpoint based on the validation + loss.""" def __init__(self): """Constructs `UploadCheckpoint`. 
@@ -43,7 +44,9 @@ def on_save_checkpoint( if curr_best_checkpoint not in self.last_best_checkpoints: self.logger.info("Uploading checkpoint...") temp_filename = ( - Path(curr_best_checkpoint).parent.with_suffix(".ckpt").name + Path(curr_best_checkpoint) + .parent.with_suffix(".ckpt") + .name ) torch.save(checkpoint, temp_filename) module.logger.upload_artifact(temp_filename, typ="weights") diff --git a/luxonis_train/core/core.py b/luxonis_train/core/core.py index c683773c..cffa3ff1 100644 --- a/luxonis_train/core/core.py +++ b/luxonis_train/core/core.py @@ -3,7 +3,7 @@ import threading from logging import getLogger from pathlib import Path -from typing import Any, Literal +from typing import Any, Literal, Mapping, overload import lightning.pytorch as pl import lightning_utilities.core.rank_zero as rank_zero_module @@ -16,15 +16,17 @@ from luxonis_ml.nn_archive import ArchiveGenerator from luxonis_ml.nn_archive.config import CONFIG_VERSION from luxonis_ml.utils import LuxonisFileSystem, reset_logging, setup_logging +from typeguard import typechecked from luxonis_train.attached_modules.visualizers import get_unnormalized_images -from luxonis_train.callbacks import LuxonisRichProgressBar, LuxonisTQDMProgressBar +from luxonis_train.callbacks import ( + LuxonisRichProgressBar, + LuxonisTQDMProgressBar, +) +from luxonis_train.loaders import BaseLoaderTorch, collate_fn from luxonis_train.models import LuxonisLightningModule -from luxonis_train.utils.config import Config -from luxonis_train.utils.general import DatasetMetadata -from luxonis_train.utils.loaders import BaseLoaderTorch, collate_fn +from luxonis_train.utils import Config, DatasetMetadata, LuxonisTrackerPL from luxonis_train.utils.registry import LOADERS -from luxonis_train.utils.tracker import LuxonisTrackerPL from .utils.export_utils import ( blobconverter_export, @@ -41,8 +43,8 @@ class LuxonisModel: """Common logic of the core components. - This class contains common logic of the core components (trainer, evaluator, - exporter, etc.). + This class contains common logic of the core components (trainer, + evaluator, exporter, etc.). 
""" def __init__( @@ -80,6 +82,7 @@ def __init__( self.cfg.tracker.save_directory, self.tracker.run_name ) self.log_file = osp.join(self.run_save_dir, "luxonis_train.log") + self.error_message = None # NOTE: to add the file handler (we only get the save dir now, # but we want to use the logger before) @@ -89,10 +92,16 @@ def __init__( # NOTE: overriding logger in pl so it uses our logger to log device info rank_zero_module.log = logger - deterministic = False if self.cfg.trainer.seed is not None: pl.seed_everything(self.cfg.trainer.seed, workers=True) - deterministic = True + + self.pl_trainer = create_trainer( + self.cfg.trainer, + logger=self.tracker, + callbacks=LuxonisRichProgressBar() + if self.cfg.trainer.use_rich_progress_bar + else LuxonisTQDMProgressBar(), + ) self.train_augmentations = Augmentations( image_size=self.cfg.trainer.preprocessing.train_image_size, @@ -114,15 +123,6 @@ def __init__( only_normalize=True, ) - self.pl_trainer = create_trainer( - self.cfg, - logger=self.tracker, - deterministic=deterministic, - callbacks=LuxonisRichProgressBar() - if self.cfg.trainer.use_rich_progress_bar - else LuxonisTQDMProgressBar(), - ) - self.loaders: dict[str, BaseLoaderTorch] = {} for view in ["train", "val", "test"]: loader_name = self.cfg.loader.name @@ -155,27 +155,31 @@ def __init__( sampler = None # TODO: implement weighted sampler if self.cfg.trainer.use_weighted_sampler: - raise NotImplementedError("Weighted sampler is not implemented yet.") + raise NotImplementedError( + "Weighted sampler is not implemented yet." + ) self.pytorch_loaders = { view: torch_data.DataLoader( self.loaders[view], batch_size=self.cfg.trainer.batch_size, - num_workers=self.cfg.trainer.num_workers, + num_workers=self.cfg.trainer.n_workers, collate_fn=collate_fn, shuffle=view == "train", drop_last=( - self.cfg.trainer.skip_last_batch if view == "train" else False + self.cfg.trainer.skip_last_batch + if view == "train" + else False ), pin_memory=self.cfg.trainer.pin_memory, sampler=sampler if view == "train" else None, ) for view in ["train", "val", "test"] } - self.error_message = None - self.dataset_metadata = DatasetMetadata.from_loader(self.loaders["train"]) - self.dataset_metadata.set_loader(self.pytorch_loaders["train"]) + self.dataset_metadata = DatasetMetadata.from_loader( + self.loaders["train"] + ) self.cfg.save_data(osp.join(self.run_save_dir, "config.yaml")) @@ -195,7 +199,7 @@ def _train(self, resume: str | None, *args, **kwargs): status = "success" try: self.pl_trainer.fit(*args, ckpt_path=resume, **kwargs) - except Exception as e: + except Exception as e: # pragma: no cover logger.exception("Encountered an exception during training.") status = "failed" raise e @@ -211,29 +215,34 @@ def train( @type new_thread: bool @param new_thread: Runs training in new thread if set to True. @type resume_weights: str | None - @param resume_weights: Path to checkpoint to resume training from. + @param resume_weights: Path to the checkpoint from which to to + resume the training. 
""" if self.cfg.trainer.matmul_precision is not None: logger.info( f"Setting matmul precision to {self.cfg.trainer.matmul_precision}" ) - torch.set_float32_matmul_precision(self.cfg.trainer.matmul_precision) + torch.set_float32_matmul_precision( + self.cfg.trainer.matmul_precision + ) if resume_weights is not None: resume_weights = str( LuxonisFileSystem.download(resume_weights, self.run_save_dir) ) - def graceful_exit(signum: int, _): - logger.info(f"{signal.Signals(signum).name} received, stopping training...") + def graceful_exit(signum: int, _): # pragma: no cover + logger.info( + f"{signal.Signals(signum).name} received, stopping training..." + ) ckpt_path = osp.join(self.run_save_dir, "resume.ckpt") self.pl_trainer.save_checkpoint(ckpt_path) self.tracker.upload_artifact( ckpt_path, typ="checkpoints", name="resume.ckpt" ) self.tracker._finalize(status="failed") - exit(0) + exit() signal.signal(signal.SIGTERM, graceful_exit) @@ -249,7 +258,7 @@ def graceful_exit(signum: int, _): logger.info("Training finished") logger.info(f"Checkpoints saved in: {self.run_save_dir}") - else: + else: # pragma: no cover # Every time exception happens in the Thread, this hook will activate def thread_exception_hook(args): self.error_message = str(args.exc_value) @@ -269,7 +278,10 @@ def thread_exception_hook(args): self.thread.start() def export( - self, onnx_save_path: str | None = None, *, weights: str | Path | None = None + self, + onnx_save_path: str | None = None, + *, + weights: str | Path | None = None, ) -> None: """Runs export. @@ -290,8 +302,12 @@ def export( export_save_dir = Path(self.run_save_dir, "export") export_save_dir.mkdir(parents=True, exist_ok=True) - export_path = export_save_dir / (self.cfg.exporter.name or self.cfg.model.name) - onnx_save_path = onnx_save_path or str(export_path.with_suffix(".onnx")) + export_path = export_save_dir / ( + self.cfg.exporter.name or self.cfg.model.name + ) + onnx_save_path = onnx_save_path or str( + export_path.with_suffix(".onnx") + ) with replace_weights(self.lightning_module, weights): output_names = self.lightning_module.export_onnx( @@ -301,7 +317,9 @@ def export( try_onnx_simplify(onnx_save_path) self._exported_models["onnx"] = Path(onnx_save_path) - scale_values, mean_values, reverse_channels = get_preprocessing(self.cfg) + scale_values, mean_values, reverse_channels = get_preprocessing( + self.cfg + ) if self.cfg.exporter.blobconverter.active: try: @@ -313,7 +331,9 @@ def export( str(export_save_dir), onnx_save_path, ) - self._exported_models["blob"] = export_path.with_suffix(".blob") + self._exported_models["blob"] = export_path.with_suffix( + ".blob" + ) except ImportError: logger.error("Failed to import `blobconverter`") logger.warning( @@ -340,36 +360,52 @@ def export( for path in self._exported_models.values(): if self.cfg.exporter.upload_to_run: self.tracker.upload_artifact(path, typ="export") - if self.cfg.exporter.upload_url is not None: + if self.cfg.exporter.upload_url is not None: # pragma: no cover LuxonisFileSystem.upload(path, self.cfg.exporter.upload_url) with open(export_path.with_suffix(".yaml"), "w") as f: yaml.dump(modelconverter_config, f) if self.cfg.exporter.upload_to_run: self.tracker.upload_artifact(f.name, name=f.name, typ="export") - if self.cfg.exporter.upload_url is not None: + if self.cfg.exporter.upload_url is not None: # pragma: no cover LuxonisFileSystem.upload(f.name, self.cfg.exporter.upload_url) + @overload def test( - self, new_thread: bool = False, view: Literal["train", "test", "val"] = "val" - ) -> None: 
+        self,
+        new_thread: Literal[False] = ...,
+        view: Literal["train", "test", "val"] = "val",
+    ) -> Mapping[str, float]: ...
+
+    @overload
+    def test(
+        self,
+        new_thread: Literal[True] = ...,
+        view: Literal["train", "test", "val"] = "val",
+    ) -> None: ...
+
+    @typechecked
+    def test(
+        self,
+        new_thread: bool = False,
+        view: Literal["train", "val", "test"] = "val",
+    ) -> Mapping[str, float] | None:
         """Runs testing.

         @type new_thread: bool
         @param new_thread: Runs testing in a new thread if set to True.
         @type view: Literal["train", "test", "val"]
         @param view: Which view to run the testing on. Defaults to "val".
+        @rtype: Mapping[str, float] | None
+        @return: If new_thread is False, returns a dictionary of test
+            results.
         """
-        if view not in self.pytorch_loaders:
-            raise ValueError(
-                f"View {view} is not valid. Valid views are: 'train', 'val', 'test'."
-            )
         loader = self.pytorch_loaders[view]
         if not new_thread:
-            self.pl_trainer.test(self.lightning_module, loader)
-        else:
+            return self.pl_trainer.test(self.lightning_module, loader)[0]
+        else:  # pragma: no cover
             self.thread = threading.Thread(
                 target=self.pl_trainer.test,
                 args=(self.lightning_module, loader),
@@ -377,22 +413,24 @@ def test(
             )
             self.thread.start()

-    def infer(self, view: str = "val", save_dir: str | Path | None = None) -> None:
+    @typechecked
+    def infer(
+        self,
+        view: Literal["train", "val", "test"] = "val",
+        save_dir: str | Path | None = None,
+    ) -> None:
         """Runs inference.

         @type view: str
-        @param view: Which split to run the inference on. Valid values are: 'train',
-            'val', 'test'. Defaults to "val".
+        @param view: Which split to run the inference on. Valid values
+            are: 'train', 'val', 'test'. Defaults to "val".
         @type save_dir: str | Path | None
-        @param save_dir: Directory where to save the visualizations. If not specified,
-            visualizations will be rendered on the screen.
+        @param save_dir: Directory where to save the visualizations. If
+            not specified, visualizations will be rendered on the
+            screen.
         """
         self.lightning_module.eval()

-        if view not in self.pytorch_loaders:
-            raise ValueError(
-                f"View {view} is not valid. Valid views are: 'train', 'val', 'test'."
- ) for inputs, labels in self.pytorch_loaders[view]: images = get_unnormalized_images(self.cfg, inputs) outputs = self.lightning_module.forward( @@ -418,18 +456,24 @@ def _objective(trial: optuna.trial.Trial) -> float: **tracker_params, ) - run_save_dir = osp.join(cfg_tracker.save_directory, child_tracker.run_name) + run_save_dir = osp.join( + cfg_tracker.save_directory, child_tracker.run_name + ) assert self.cfg.tuner is not None - curr_params = get_trial_params(all_augs, self.cfg.tuner.params, trial) + curr_params = get_trial_params( + all_augs, self.cfg.tuner.params, trial + ) curr_params["model.predefined_model"] = None cfg_copy = self.cfg.model_copy(deep=True) + # manually remove Normalize so it doesn't + # get duplicated when creating new cfg instance cfg_copy.trainer.preprocessing.augmentations = [ a for a in cfg_copy.trainer.preprocessing.augmentations if a.name != "Normalize" - ] # manually remove Normalize so it doesn't duplicate it when creating new cfg instance + ] cfg = Config.get_config(cfg_copy.model_dump(), curr_params) child_tracker.log_hyperparams(curr_params) @@ -449,18 +493,16 @@ def _objective(trial: optuna.trial.Trial) -> float: else LuxonisTQDMProgressBar() ] - pruner_callback = PyTorchLightningPruningCallback(trial, monitor="val/loss") + pruner_callback = PyTorchLightningPruningCallback( + trial, monitor="val/loss" + ) callbacks.append(pruner_callback) - deterministic = False - if self.cfg.trainer.seed: + + if self.cfg.trainer.seed is not None: pl.seed_everything(cfg.trainer.seed, workers=True) - deterministic = True pl_trainer = create_trainer( - cfg, - logger=child_tracker, - callbacks=callbacks, - deterministic=deterministic, + cfg.trainer, logger=child_tracker, callbacks=callbacks ) try: @@ -475,7 +517,9 @@ def _objective(trial: optuna.trial.Trial) -> float: except optuna.TrialPruned as e: logger.info(e) - if "val/loss" not in pl_trainer.callback_metrics: + if ( + "val/loss" not in pl_trainer.callback_metrics + ): # pragma: no cover raise ValueError( "No validation loss found. " "This can happen if `TestOnTrainEnd` callback is used." @@ -485,9 +529,13 @@ def _objective(trial: optuna.trial.Trial) -> float: cfg_tuner = self.cfg.tuner if cfg_tuner is None: - raise ValueError("You have to specify the `tuner` section in config.") + raise ValueError( + "You have to specify the `tuner` section in config." 
+ ) - all_augs = [a.name for a in self.cfg.trainer.preprocessing.augmentations] + all_augs = [ + a.name for a in self.cfg.trainer.preprocessing.augmentations + ] rank = rank_zero_only.rank cfg_tracker = self.cfg.tracker tracker_params = cfg_tracker.model_dump() @@ -499,7 +547,7 @@ def _objective(trial: optuna.trial.Trial) -> float: is_sweep=False, **tracker_params, ) - if self.parent_tracker.is_mlflow: + if self.parent_tracker.is_mlflow: # pragma: no cover # Experiment needs to be interacted with to create actual MLFlow run self.parent_tracker.experiment["mlflow"].active_run() @@ -515,7 +563,7 @@ def _objective(trial: optuna.trial.Trial) -> float: if cfg_tuner.storage.active: if cfg_tuner.storage.storage_type == "local": storage = "sqlite:///study_local.db" - else: + else: # pragma: no cover storage = "postgresql://{}:{}@{}:{}/{}".format( self.cfg.ENVIRON.POSTGRES_USER, self.cfg.ENVIRON.POSTGRES_PASSWORD, @@ -540,7 +588,7 @@ def _objective(trial: optuna.trial.Trial) -> float: self.parent_tracker.log_hyperparams(study.best_params) - if self.cfg.tracker.is_wandb: + if self.cfg.tracker.is_wandb: # pragma: no cover # If wandb used then init parent tracker separately at the end wandb_parent_tracker = LuxonisTrackerPL( rank=rank_zero_only.rank, @@ -555,8 +603,8 @@ def archive(self, path: str | Path | None = None) -> Path: """Generates an NN Archive out of a model executable. @type path: str | Path | None - @param path: Path to the model executable. If not specified, the model will be - exported first. + @param path: Path to the model executable. If not specified, the + model will be exported first. @rtype: Path @return: Path to the generated NN Archive. """ @@ -583,8 +631,12 @@ def _mult(lst: list[float | int]) -> list[float]: return [round(x * 255.0, 5) for x in lst] preprocessing = { # TODO: keep preprocessing same for each input? - "mean": _mult(self.cfg.trainer.preprocessing.normalize.params["mean"]), - "scale": _mult(self.cfg.trainer.preprocessing.normalize.params["std"]), + "mean": _mult( + self.cfg.trainer.preprocessing.normalize.params["mean"] + ), + "scale": _mult( + self.cfg.trainer.preprocessing.normalize.params["std"] + ), "reverse_channels": self.cfg.trainer.preprocessing.train_rgb, "interleaved_to_planar": False, # TODO: make it modifiable? } @@ -642,8 +694,10 @@ def _mult(lst: list[float | int]) -> list[float]: logger.info(f"NN Archive saved to {archive_path}") - if self.cfg.archiver.upload_url is not None: - LuxonisFileSystem.upload(archive_path, self.cfg.archiver.upload_url) + if self.cfg.archiver.upload_url is not None: # pragma: no cover + LuxonisFileSystem.upload( + archive_path, self.cfg.archiver.upload_url + ) if self.cfg.archiver.upload_to_run: self.tracker.upload_artifact(archive_path, typ="archive") @@ -655,14 +709,15 @@ def get_status(self) -> tuple[int, int]: """Get current status of training. @rtype: tuple[int, int] - @return: First element is current epoch, second element is total number of - epochs. + @return: First element is current epoch, second element is total + number of epochs. """ return self.lightning_module.get_status() @rank_zero_only def get_status_percentage(self) -> float: - """Return percentage of current training, takes into account early stopping. + """Return percentage of current training, takes into account + early stopping. @rtype: float @return: Percentage of current training in range 0-100. 
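Stripped of the tracker and config plumbing, the tuning flow above follows the standard Optuna pattern: create a study (optionally backed by the same `sqlite:///study_local.db` storage used for local runs), optimize an objective that performs one training run and reports `val/loss`, then read `study.best_params`. A minimal sketch with a toy objective standing in for the full training run; the study name and trial count are illustrative only:

```python
import optuna


def objective(trial: optuna.trial.Trial) -> float:
    # In LuxonisModel.tune, a full training run happens here and the final
    # "val/loss" from the trainer's callback metrics is returned instead.
    lr = trial.suggest_float("trainer.optimizer.params.lr", 1e-5, 1e-2, log=True)
    return (lr - 1e-3) ** 2  # toy stand-in for the validation loss


study = optuna.create_study(
    study_name="example_study",
    storage="sqlite:///study_local.db",  # matches the local storage branch above
    direction="minimize",
    load_if_exists=True,
)
study.optimize(objective, n_trials=10)
print(study.best_params)
```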
@@ -671,7 +726,8 @@ def get_status_percentage(self) -> float: @rank_zero_only def get_error_message(self) -> str | None: - """Return error message if one occurs while running in thread, otherwise None. + """Return error message if one occurs while running in thread, + otherwise None. @rtype: str | None @return: Error message @@ -680,10 +736,12 @@ def get_error_message(self) -> str | None: @rank_zero_only def get_min_loss_checkpoint_path(self) -> str | None: - """Return best checkpoint path with respect to minimal validation loss. + """Return best checkpoint path with respect to minimal + validation loss. @rtype: str - @return: Path to best checkpoint with respect to minimal validation loss + @return: Path to best checkpoint with respect to minimal + validation loss """ if not self.pl_trainer.checkpoint_callbacks: return None @@ -691,10 +749,12 @@ def get_min_loss_checkpoint_path(self) -> str | None: @rank_zero_only def get_best_metric_checkpoint_path(self) -> str | None: - """Return best checkpoint path with respect to best validation metric. + """Return best checkpoint path with respect to best validation + metric. @rtype: str - @return: Path to best checkpoint with respect to best validation metric + @return: Path to best checkpoint with respect to best validation + metric """ if len(self.pl_trainer.checkpoint_callbacks) < 2: return None diff --git a/luxonis_train/core/utils/archive_utils.py b/luxonis_train/core/utils/archive_utils.py index 72cdefc7..96c2bcde 100644 --- a/luxonis_train/core/utils/archive_utils.py +++ b/luxonis_train/core/utils/archive_utils.py @@ -15,7 +15,7 @@ ImplementedHeads, ImplementedHeadsIsSoxtmaxed, ) -from luxonis_train.utils.config import Config +from luxonis_train.utils import Config logger = logging.getLogger(__name__) @@ -63,7 +63,7 @@ def _from_onnx_dtype(dtype: int) -> DataType: TensorProto.FLOAT: "float32", TensorProto.FLOAT16: "float16", } - if dtype not in dtype_map: + if dtype not in dtype_map: # pragma: no cover raise ValueError(f"Unsupported ONNX data type: `{dtype}`") return DataType(dtype_map[dtype]) @@ -72,7 +72,7 @@ def _from_onnx_dtype(dtype: int) -> DataType: def _load_onnx_model(onnx_path: Path) -> onnx.ModelProto: try: return onnx.load(str(onnx_path)) - except Exception as e: + except Exception as e: # pragma: no cover raise ValueError(f"Failed to load ONNX model: `{onnx_path}`") from e @@ -98,7 +98,9 @@ def _get_onnx_inputs(onnx_path: Path) -> dict[str, MetadataDict]: for inp in model.graph.input: shape = [dim.dim_value for dim in inp.type.tensor_type.shape.dim] inputs[inp.name]["shape"] = shape - inputs[inp.name]["dtype"] = _from_onnx_dtype(inp.type.tensor_type.elem_type) + inputs[inp.name]["dtype"] = _from_onnx_dtype( + inp.type.tensor_type.elem_type + ) return inputs @@ -116,7 +118,7 @@ def _get_classes( node_task = "segmentation" case "ImplicitKeypointBBoxHead" | "EfficientKeypointBBoxHead": node_task = "keypoints" - case _: + case _: # pragma: no cover raise ValueError("Node does not map to a default task.") return classes.get(node_task, []) @@ -137,7 +139,9 @@ def _get_head_specific_parameters( parameters = {} if head_name == "ClassificationHead": - parameters["is_softmax"] = getattr(ImplementedHeadsIsSoxtmaxed, head_name).value + parameters["is_softmax"] = getattr( + ImplementedHeadsIsSoxtmaxed, head_name + ).value elif head_name == "EfficientBBoxHead": parameters["subtype"] = ObjectDetectionSubtypeYOLO.YOLOv6.value head_node = nodes[head_alias] @@ -145,7 +149,9 @@ def _get_head_specific_parameters( parameters["conf_threshold"] = 
head_node.conf_thres
         parameters["max_det"] = head_node.max_det
     elif head_name in ["SegmentationHead", "BiSeNetHead"]:
-        parameters["is_softmax"] = getattr(ImplementedHeadsIsSoxtmaxed, head_name).value
+        parameters["is_softmax"] = getattr(
+            ImplementedHeadsIsSoxtmaxed, head_name
+        ).value
     elif head_name == "ImplicitKeypointBBoxHead":
         parameters["subtype"] = ObjectDetectionSubtypeYOLO.YOLOv7.value
         head_node = nodes[head_alias]
@@ -161,18 +167,21 @@ def _get_head_specific_parameters(
         parameters["conf_threshold"] = head_node.conf_thres
         parameters["max_det"] = head_node.max_det
         parameters["n_keypoints"] = head_node.n_keypoints
-    else:
+    else:  # pragma: no cover
         raise ValueError("Unknown head name")
     return parameters


-def _get_head_outputs(outputs: list[dict], head_name: str, head_type: str) -> list[str]:
+def _get_head_outputs(
+    outputs: list[dict], head_name: str, head_type: str
+) -> list[str]:
     """Get model outputs in a head-specific format.

     @type outputs: list[dict]
     @param outputs: List of NN Archive outputs.
     @type head_name: str
-    @param head_name: Type of the head (e.g. 'EfficientBBoxHead') or its custom alias.
+    @param head_name: Type of the head (e.g. 'EfficientBBoxHead') or its
+        custom alias.
     @type head_type: str
     @param head_type: Type of the head (e.g. 'EfficientBBoxHead').
     @rtype: list[str]
@@ -238,7 +247,9 @@ def get_heads(
                 task = str(next(iter(task.values())))

             classes = _get_classes(node_name, task, class_dict)
-            head_outputs = _get_head_outputs(outputs, node_alias, node_name)
+            head_outputs = _get_head_outputs(
+                outputs, node_alias, node_name
+            )
             head_dict = {
                 "parser": parser,
                 "metadata": {
diff --git a/luxonis_train/core/utils/export_utils.py b/luxonis_train/core/utils/export_utils.py
index 3b34a912..b4863f1b 100644
--- a/luxonis_train/core/utils/export_utils.py
+++ b/luxonis_train/core/utils/export_utils.py
@@ -42,7 +42,7 @@ def try_onnx_simplify(onnx_path: str) -> None:
         model_onnx = onnx.load(onnx_path)
         onnx_model, check = onnxsim.simplify(model_onnx)
         if not check:
-            raise RuntimeError("ONNX simplify failed.")
+            raise RuntimeError("ONNX simplify failed.")  # pragma: no cover

         onnx.save(onnx_model, onnx_path)
         logger.info(f"ONNX model saved to {onnx_path}")
@@ -52,7 +52,7 @@ def try_onnx_simplify(onnx_path: str) -> None:
             "`onnxsim` not installed. Skipping ONNX model simplification. "
             "Ensure `onnxsim` is installed in your environment."
         )
-    except RuntimeError:
+    except RuntimeError:  # pragma: no cover
         logger.error(
             "Failed to simplify ONNX model. Proceeding without simplification."
)
@@ -100,7 +100,7 @@ def blobconverter_export(

     logger.info("Converting ONNX to .blob")

-    optimizer_params = []
+    optimizer_params: list[str] = []
     if scale_values:
         optimizer_params.append(f"--scale_values={scale_values}")
     if mean_values:
@@ -111,7 +111,7 @@ def blobconverter_export(
     blob_path = blobconverter.from_onnx(
         model=onnx_path,
         optimizer_params=optimizer_params,
-        data_type=cfg.data_type,
+        data_type=cfg.data_type.upper(),
         shaves=cfg.blobconverter.shaves,
         version=cfg.blobconverter.version,
         use_cache=False,
diff --git a/luxonis_train/core/utils/train_utils.py b/luxonis_train/core/utils/train_utils.py
index 3a45a85b..73b615cb 100644
--- a/luxonis_train/core/utils/train_utils.py
+++ b/luxonis_train/core/utils/train_utils.py
@@ -1,9 +1,11 @@
+from typing import Any
+
 import lightning.pytorch as pl

-from luxonis_train.utils.config import Config
+from luxonis_train.utils.config import TrainerConfig


-def create_trainer(cfg: Config, **kwargs) -> pl.Trainer:
+def create_trainer(cfg: TrainerConfig, **kwargs: Any) -> pl.Trainer:
     """Creates Pytorch Lightning trainer.

     @type cfg: TrainerConfig
     @param cfg: Configuration object.
     @type kwargs: dict
     @return: Pytorch Lightning trainer.
     """
     return pl.Trainer(
-        accelerator=cfg.trainer.accelerator,
-        devices=cfg.trainer.devices,
-        strategy=cfg.trainer.strategy,
-        max_epochs=cfg.trainer.epochs,
-        accumulate_grad_batches=cfg.trainer.accumulate_grad_batches,
-        check_val_every_n_epoch=cfg.trainer.validation_interval,
-        num_sanity_val_steps=cfg.trainer.num_sanity_val_steps,
-        profiler=cfg.trainer.profiler,
+        accelerator=cfg.accelerator,
+        devices=cfg.devices,
+        strategy=cfg.strategy,
+        max_epochs=cfg.epochs,
+        accumulate_grad_batches=cfg.accumulate_grad_batches,
+        check_val_every_n_epoch=cfg.validation_interval,
+        num_sanity_val_steps=cfg.n_sanity_val_steps,
+        profiler=cfg.profiler,
+        deterministic=cfg.deterministic,
         **kwargs,
     )
diff --git a/luxonis_train/core/utils/tune_utils.py b/luxonis_train/core/utils/tune_utils.py
index e2fe692e..d9d6c4c0 100644
--- a/luxonis_train/core/utils/tune_utils.py
+++ b/luxonis_train/core/utils/tune_utils.py
@@ -61,17 +61,23 @@ def get_trial_params(
         case "int", [int(low), int(high), *tail]:
             step = tail[0] if tail else 1
             if not isinstance(step, int):
-                raise ValueError(f"Step for int type must be int, but got {step}")
+                raise ValueError(
+                    f"Step for int type must be int, but got {step}"
+                )
             new_value = trial.suggest_int(key_name, low, high, step=step)
         case "loguniform", [float(low), float(high)]:
             new_value = trial.suggest_loguniform(key_name, low, high)
         case "uniform", [float(low), float(high)]:
             new_value = trial.suggest_uniform(key_name, low, high)
         case _, _:
-            raise KeyError(f"Combination of {key_type} and {value} not supported")
+            raise KeyError(
+                f"Combination of {key_type} and {value} not supported"
+            )
         new_params[key_name] = new_value

     if len(new_params) == 0:
-        raise ValueError("No paramteres to tune. Specify them under `tuner.params`.")
+        raise ValueError(
+            "No parameters to tune. Specify them under `tuner.params`."
+ ) return new_params diff --git a/luxonis_train/utils/loaders/__init__.py b/luxonis_train/loaders/__init__.py similarity index 100% rename from luxonis_train/utils/loaders/__init__.py rename to luxonis_train/loaders/__init__.py diff --git a/luxonis_train/utils/loaders/base_loader.py b/luxonis_train/loaders/base_loader.py similarity index 65% rename from luxonis_train/utils/loaders/base_loader.py rename to luxonis_train/loaders/base_loader.py index 5e884955..b6b8a863 100644 --- a/luxonis_train/utils/loaders/base_loader.py +++ b/luxonis_train/loaders/base_loader.py @@ -1,16 +1,17 @@ from abc import ABC, abstractmethod import torch -from luxonis_ml.data import Augmentations +from luxonis_ml.data import Augmentations, LabelType from luxonis_ml.utils.registry import AutoRegisterMeta from torch import Size, Tensor from torch.utils.data import Dataset from luxonis_train.utils.registry import LOADERS -from luxonis_train.utils.types import Labels, LabelType +from luxonis_train.utils.types import Labels LuxonisLoaderTorchOutput = tuple[dict[str, Tensor], Labels] -"""LuxonisLoaderTorchOutput is a tuple of source tensors and corresponding labels.""" +"""LuxonisLoaderTorchOutput is a tuple of source tensors and +corresponding labels.""" class BaseLoaderTorch( @@ -20,8 +21,8 @@ class BaseLoaderTorch( register=False, registry=LOADERS, ): - """Base abstract loader class that enforces LuxonisLoaderTorchOutput output label - structure.""" + """Base abstract loader class that enforces LuxonisLoaderTorchOutput + output label structure.""" def __init__( self, @@ -38,6 +39,8 @@ def image_source(self) -> str: """Name of the input image group. Example: 'image' + + @type: str """ if self._image_source is None: raise ValueError("image_source is not set") @@ -47,39 +50,46 @@ def image_source(self) -> str: @abstractmethod def input_shapes(self) -> dict[str, Size]: """ - Shape of each loader group (sub-element), WITHOUT batch dimension. + Shape (c, h, w) of each loader group (sub-element), WITHOUT batch dimension. Examples: - 1. Single image input:: - { - 'image': torch.Size([3, 224, 224]), - } - - 2. Image and segmentation input:: - { - 'image': torch.Size([3, 224, 224]), - 'segmentation': torch.Size([1, 224, 224]), - } - - 3. Left image, right image and disparity input:: - { - 'left': torch.Size([3, 224, 224]), - 'right': torch.Size([3, 224, 224]), - 'disparity': torch.Size([1, 224, 224]), - } - - 4. Image, keypoints, and point cloud input:: - { - 'image': torch.Size([3, 224, 224]), - 'keypoints': torch.Size([17, 2]), - 'point_cloud': torch.Size([20000, 3]), - } - - @rtype: dict[str, Size] - @return: A dictionary mapping group names to their shapes. + 1. Single image input:: + { + 'image': torch.Size([3, 224, 224]), + } + + 2. Image and segmentation input:: + { + 'image': torch.Size([3, 224, 224]), + 'segmentation': torch.Size([1, 224, 224]), + } + + 3. Left image, right image and disparity input:: + { + 'left': torch.Size([3, 224, 224]), + 'right': torch.Size([3, 224, 224]), + 'disparity': torch.Size([1, 224, 224]), + } + + 4. Image, keypoints, and point cloud input:: + { + 'image': torch.Size([3, 224, 224]), + 'keypoints': torch.Size([17, 2]), + 'point_cloud': torch.Size([20000, 3]), + } + + @type: dict[str, Size] """ ... + @property + def input_shape(self) -> Size: + """Shape (c, h, w) of the input tensor, WITHOUT batch dimension. 
+ + @type: torch.Size + """ + return self.input_shapes[self.image_source] + @abstractmethod def __len__(self) -> int: """Returns length of the dataset.""" @@ -106,11 +116,12 @@ def get_classes(self) -> dict[str, list[str]]: ... def get_n_keypoints(self) -> dict[str, int] | None: - """Returns the dictionary defining the semantic skeleton for each class using - keypoints. + """Returns the dictionary defining the semantic skeleton for + each class using keypoints. @rtype: Dict[str, Dict] - @return: A dictionary mapping classes to their skeleton definitions. + @return: A dictionary mapping classes to their skeleton + definitions. """ return None @@ -121,19 +132,21 @@ def collate_fn( """Default collate function used for training. @type batch: list[LuxonisLoaderTorchOutput] - @param batch: List of loader outputs (dict of Tensors) and labels (dict of Tensors) - in the LuxonisLoaderTorchOutput format. + @param batch: List of loader outputs (dict of Tensors) and labels + (dict of Tensors) in the LuxonisLoaderTorchOutput format. @rtype: tuple[dict[str, Tensor], dict[LabelType, Tensor]] - @return: Tuple of inputs and annotations in the format expected by the model. + @return: Tuple of inputs and annotations in the format expected by + the model. """ inputs: tuple[dict[str, Tensor], ...] labels: tuple[Labels, ...] inputs, labels = zip(*batch) - out_inputs = {k: torch.stack([i[k] for i in inputs], 0) for k in inputs[0].keys()} - out_labels = {task: {} for task in labels[0].keys()} + out_inputs = { + k: torch.stack([i[k] for i in inputs], 0) for k in inputs[0].keys() + } - out_labels = {} + out_labels: Labels = {} for task in labels[0].keys(): label_type = labels[0][task][1] diff --git a/luxonis_train/utils/loaders/luxonis_loader_torch.py b/luxonis_train/loaders/luxonis_loader_torch.py similarity index 98% rename from luxonis_train/utils/loaders/luxonis_loader_torch.py rename to luxonis_train/loaders/luxonis_loader_torch.py index 328f87be..8286a7a2 100644 --- a/luxonis_train/utils/loaders/luxonis_loader_torch.py +++ b/luxonis_train/loaders/luxonis_loader_torch.py @@ -156,7 +156,9 @@ def _parse_dataset( f"Supported types are: {', '.join(DatasetType.__members__)}." 
) - logger.info(f"Parsing dataset from {dataset_dir} with name '{dataset_name}'") + logger.info( + f"Parsing dataset from {dataset_dir} with name '{dataset_name}'" + ) return LuxonisParser( dataset_dir, diff --git a/luxonis_train/models/luxonis_lightning.py b/luxonis_train/models/luxonis_lightning.py index a3671dac..2bbf8ca9 100644 --- a/luxonis_train/models/luxonis_lightning.py +++ b/luxonis_train/models/luxonis_lightning.py @@ -1,6 +1,7 @@ from collections import defaultdict from collections.abc import Mapping from logging import getLogger +from pathlib import Path from typing import Literal, cast import lightning.pytorch as pl @@ -17,21 +18,32 @@ BaseMetric, BaseVisualizer, ) -from luxonis_train.attached_modules.metrics.common import TorchMetricWrapper +from luxonis_train.attached_modules.metrics.torchmetrics import ( + TorchMetricWrapper, +) from luxonis_train.attached_modules.visualizers import ( combine_visualizations, get_unnormalized_images, ) -from luxonis_train.callbacks import ( - BaseLuxonisProgressBar, - ModuleFreezer, -) +from luxonis_train.callbacks import BaseLuxonisProgressBar, ModuleFreezer from luxonis_train.nodes import BaseNode +from luxonis_train.utils import ( + DatasetMetadata, + Kwargs, + Labels, + LuxonisTrackerPL, + Packet, + to_shape_packet, + traverse_graph, +) from luxonis_train.utils.config import AttachedModuleConfig, Config -from luxonis_train.utils.general import DatasetMetadata, to_shape_packet, traverse_graph -from luxonis_train.utils.registry import CALLBACKS, OPTIMIZERS, SCHEDULERS, Registry -from luxonis_train.utils.tracker import LuxonisTrackerPL -from luxonis_train.utils.types import Kwargs, Labels, Packet +from luxonis_train.utils.graph import Graph +from luxonis_train.utils.registry import ( + CALLBACKS, + OPTIMIZERS, + SCHEDULERS, + Registry, +) from .luxonis_output import LuxonisOutput @@ -105,13 +117,13 @@ def __init__( @type save_dir: str @param save_dir: Directory to save checkpoints. @type input_shapes: dict[str, Size] - @param input_shapes: Dictionary of input shapes. Keys are input names, values - are shapes. + @param input_shapes: Dictionary of input shapes. Keys are input + names, values are shapes. @type dataset_metadata: L{DatasetMetadata} | None @param dataset_metadata: Dataset metadata. @type kwargs: Any - @param kwargs: Additional arguments to pass to the L{LightningModule} - constructor. + @param kwargs: Additional arguments to pass to the + L{LightningModule} constructor. 
""" super().__init__(**kwargs) @@ -123,18 +135,24 @@ def __init__( self.image_source = cfg.loader.image_source self.dataset_metadata = dataset_metadata or DatasetMetadata() self.frozen_nodes: list[tuple[nn.Module, int]] = [] - self.graph: dict[str, list[str]] = {} + self.graph: Graph = {} self.loader_input_shapes: dict[str, dict[str, Size]] = {} self.node_input_sources: dict[str, list[str]] = defaultdict(list) self.loss_weights: dict[str, float] = {} self.main_metric: str | None = None self.save_dir = save_dir self.test_step_outputs: list[Mapping[str, Tensor | float | int]] = [] - self.training_step_outputs: list[Mapping[str, Tensor | float | int]] = [] - self.validation_step_outputs: list[Mapping[str, Tensor | float | int]] = [] + self.training_step_outputs: list[ + Mapping[str, Tensor | float | int] + ] = [] + self.validation_step_outputs: list[ + Mapping[str, Tensor | float | int] + ] = [] self.losses: dict[str, dict[str, BaseLoss]] = defaultdict(dict) self.metrics: dict[str, dict[str, BaseMetric]] = defaultdict(dict) - self.visualizers: dict[str, dict[str, BaseVisualizer]] = defaultdict(dict) + self.visualizers: dict[str, dict[str, BaseVisualizer]] = defaultdict( + dict + ) self._logged_images = 0 @@ -152,7 +170,9 @@ def __init__( elif isinstance(node_cfg.freezing.unfreeze_after, int): unfreeze_after = node_cfg.freezing.unfreeze_after else: - unfreeze_after = int(node_cfg.freezing.unfreeze_after * epochs) + unfreeze_after = int( + node_cfg.freezing.unfreeze_after * epochs + ) frozen_nodes.append((node_name, unfreeze_after)) if node_cfg.task is not None: @@ -172,8 +192,14 @@ def __init__( node_cfg.task = {next(iter(Node.tasks)): node_cfg.task} else: - node_cfg.task = {**Node._process_tasks(Node.tasks), **node_cfg.task} - nodes[node_name] = (Node, {**node_cfg.params, "_tasks": node_cfg.task}) + node_cfg.task = { + **Node._process_tasks(Node.tasks), + **node_cfg.task, + } + nodes[node_name] = ( + Node, + {**node_cfg.params, "_tasks": node_cfg.task}, + ) # Handle inputs for this node if node_cfg.input_sources: @@ -241,7 +267,7 @@ def __init__( @property def core(self) -> "luxonis_train.core.LuxonisModel": """Returns the core model.""" - if self._core is None: + if self._core is None: # pragma: no cover raise ValueError("Core reference is not set.") return self._core @@ -251,12 +277,12 @@ def _initiate_nodes( ) -> nn.ModuleDict: """Initializes all the nodes in the model. - Traverses the graph and initiates each node using outputs of the preceding - nodes. + Traverses the graph and initiates each node using outputs of the + preceding nodes. @type nodes: dict[str, tuple[type[LuxonisNode], Kwargs]] - @param nodes: Dictionary of nodes to be initiated. Keys are node names, values - are tuples of node class and node kwargs. + @param nodes: Dictionary of nodes to be initiated. Keys are node + names, values are tuples of node class and node kwargs. @rtype: L{nn.ModuleDict}[str, L{LuxonisNode}] @return: Dictionary of initiated nodes. """ @@ -268,9 +294,10 @@ def _initiate_nodes( for source_name, shape in shapes.items() } - for node_name, (Node, node_kwargs), node_input_names, _ in traverse_graph( - self.graph, nodes - ): + for node_name, ( + Node, + node_kwargs, + ), node_input_names, _ in traverse_graph(self.graph, nodes): node_dummy_inputs: list[Packet[Tensor]] = [] """List of dummy input packets for the node. @@ -313,23 +340,27 @@ def forward( ) -> LuxonisOutput: """Forward pass of the model. - Traverses the graph and step-by-step computes the outputs of each node. 
Each - next node is computed only when all of its predecessors are computed. Once the - outputs are not needed anymore, they are removed from the memory. + Traverses the graph and step-by-step computes the outputs of + each node. Each next node is computed only when all of its + predecessors are computed. Once the outputs are not needed + anymore, they are removed from the memory. @type inputs: L{Tensor} @param inputs: Input tensor. @type task_labels: L{TaskLabels} | None @param task_labels: Labels dictionary. Defaults to C{None}. @type images: L{Tensor} | None - @param images: Canvas tensor for visualizers. Defaults to C{None}. + @param images: Canvas tensor for visualizers. Defaults to + C{None}. @type compute_loss: bool - @param compute_loss: Whether to compute losses. Defaults to C{True}. + @param compute_loss: Whether to compute losses. Defaults to + C{True}. @type compute_metrics: bool - @param compute_metrics: Whether to update metrics. Defaults to C{True}. + @param compute_metrics: Whether to update metrics. Defaults to + C{True}. @type compute_visualizations: bool - @param compute_visualizations: Whether to compute visualizations. Defaults to - C{False}. + @param compute_visualizations: Whether to compute + visualizations. Defaults to C{False}. @rtype: L{LuxonisOutput} @return: Output of the model. """ @@ -353,11 +384,19 @@ def forward( outputs = node.run(node_inputs) computed[node_name] = outputs - if compute_loss and node_name in self.losses and labels is not None: + if ( + compute_loss + and node_name in self.losses + and labels is not None + ): for loss_name, loss in self.losses[node_name].items(): losses[node_name][loss_name] = loss.run(outputs, labels) - if compute_metrics and node_name in self.metrics and labels is not None: + if ( + compute_metrics + and node_name in self.metrics + and labels is not None + ): for metric in self.metrics[node_name].values(): metric.run_update(outputs, labels) @@ -367,7 +406,9 @@ def forward( and images is not None and labels is not None ): - for viz_name, visualizer in self.visualizers[node_name].items(): + for viz_name, visualizer in self.visualizers[ + node_name + ].items(): viz = combine_visualizations( visualizer.run( images, @@ -420,7 +461,7 @@ def compute_metrics(self) -> dict[str, dict[str, Tensor]]: computed_submetrics = {metric_name: metric_value} case dict(submetrics): computed_submetrics = submetrics - case unknown: + case unknown: # pragma: no cover raise ValueError( f"Metric {metric_name} returned unexpected value of " f"type {type(unknown)}." @@ -435,7 +476,8 @@ def export_onnx(self, save_path: str, **kwargs) -> list[str]: @type save_path: str @param save_path: Path where the exported model will be saved. @type kwargs: Any - @param kwargs: Additional arguments for the L{torch.onnx.export} method. + @param kwargs: Additional arguments for the L{torch.onnx.export} + method. @rtype: list[str] @return: List of output names. 
""" @@ -448,7 +490,8 @@ def export_onnx(self, save_path: str, **kwargs) -> list[str]: } inputs_deep_clone = { - k: torch.zeros(elem.shape).to(self.device) for k, elem in inputs.items() + k: torch.zeros(elem.shape).to(self.device) + for k, elem in inputs.items() } inputs_for_onnx = {"inputs": inputs_deep_clone} @@ -519,22 +562,26 @@ def export_forward(inputs) -> tuple[Tensor, ...]: def process_losses( self, - losses_dict: dict[str, dict[str, Tensor | tuple[Tensor, dict[str, Tensor]]]], + losses_dict: dict[ + str, dict[str, Tensor | tuple[Tensor, dict[str, Tensor]]] + ], ) -> tuple[Tensor, dict[str, Tensor]]: """Processes individual losses from the model run. - Goes over the computed losses and computes the final loss as a weighted sum of - all the losses. + Goes over the computed losses and computes the final loss as a + weighted sum of all the losses. - @type losses_dict: dict[str, dict[str, Tensor | tuple[Tensor, dict[str, - Tensor]]]] - @param losses_dict: Dictionary of computed losses. Each node can have multiple - losses attached. The first key identifies the node, the second key - identifies the specific loss. Values are either single tensors or tuples of - tensors and sublosses. + @type losses_dict: dict[str, dict[str, Tensor | tuple[Tensor, + dict[str, Tensor]]]] + @param losses_dict: Dictionary of computed losses. Each node can + have multiple losses attached. The first key identifies the + node, the second key identifies the specific loss. Values + are either single tensors or tuples of tensors and + sublosses. @rtype: tuple[Tensor, dict[str, Tensor]] - @return: Tuple of final loss and dictionary of processed sublosses. The - dictionary is in a format of {loss_name: loss_value}. + @return: Tuple of final loss and dictionary of processed + sublosses. The dictionary is in a format of {loss_name: + loss_value}. 
""" final_loss = torch.zeros(1, device=self.device) training_step_output: dict[str, Tensor] = {} @@ -548,9 +595,9 @@ def process_losses( loss *= self.loss_weights[loss_name] final_loss += loss - training_step_output[ - f"loss/{node_name}/{loss_name}" - ] = loss.detach().cpu() + training_step_output[f"loss/{node_name}/{loss_name}"] = ( + loss.detach().cpu() + ) if self.cfg.trainer.log_sub_losses and sublosses: for subloss_name, subloss_value in sublosses.items(): training_step_output[ @@ -559,10 +606,14 @@ def process_losses( training_step_output["loss"] = final_loss.detach().cpu() return final_loss, training_step_output - def training_step(self, train_batch: tuple[dict[str, Tensor], Labels]) -> Tensor: + def training_step( + self, train_batch: tuple[dict[str, Tensor], Labels] + ) -> Tensor: """Performs one step of training with provided batch.""" outputs = self.forward(*train_batch) - assert outputs.losses, "Losses are empty, check if you have defined any loss" + assert ( + outputs.losses + ), "Losses are empty, check if you have defined any loss" loss, training_step_output = self.process_losses(outputs.losses) self.training_step_outputs.append(training_step_output) @@ -605,7 +656,8 @@ def get_status(self) -> tuple[int, int]: return self.current_epoch, self.cfg.trainer.epochs def get_status_percentage(self) -> float: - """Returns percentage of current training, takes into account early stopping.""" + """Returns percentage of current training, takes into account + early stopping.""" if self._trainer.early_stopping_callback: # model haven't yet stop from early stopping callback if self._trainer.early_stopping_callback.stopped_epoch == 0: @@ -616,11 +668,13 @@ def get_status_percentage(self) -> float: return (self.current_epoch / self.cfg.trainer.epochs) * 100 def _evaluation_step( - self, mode: Literal["test", "val"], batch: tuple[dict[str, Tensor], Labels] + self, + mode: Literal["test", "val"], + batch: tuple[dict[str, Tensor], Labels], ) -> dict[str, Tensor]: inputs, labels = batch images = None - if self._logged_images < self.cfg.trainer.num_log_images: + if self._logged_images < self.cfg.trainer.n_log_images: images = get_unnormalized_images(self.cfg, inputs) outputs = self.forward( inputs, @@ -638,7 +692,7 @@ def _evaluation_step( for viz_name, viz_batch in visualizations.items(): logged_images = self._logged_images for viz in viz_batch: - if logged_images >= self.cfg.trainer.num_log_images: + if logged_images >= self.cfg.trainer.n_log_images: break self.logger.log_image( f"{mode}/visualizations/{node_name}/{viz_name}/{logged_images}", @@ -662,7 +716,9 @@ def _evaluation_epoch_end(self, mode: Literal["test", "val"]) -> None: logger.info("Metrics computed.") for node_name, metrics in computed_metrics.items(): for metric_name, metric_value in metrics.items(): - metric_results[node_name][metric_name] = metric_value.cpu().item() + metric_results[node_name][metric_name] = ( + metric_value.cpu().item() + ) self.log( f"{mode}/metric/{node_name}/{metric_name}", metric_value, @@ -682,7 +738,9 @@ def _evaluation_epoch_end(self, mode: Literal["test", "val"]) -> None: def configure_callbacks(self) -> list[pl.Callback]: """Configures Pytorch Lightning callbacks.""" self.min_val_loss_checkpoints_path = f"{self.save_dir}/min_val_loss" - self.best_val_metric_checkpoints_path = f"{self.save_dir}/best_val_metric" + self.best_val_metric_checkpoints_path = ( + f"{self.save_dir}/best_val_metric" + ) model_name = self.cfg.model.name callbacks: list[pl.Callback] = [ @@ -716,14 +774,17 @@ def 
configure_callbacks(self) -> list[pl.Callback]:

         for callback in self.cfg.trainer.callbacks:
             if callback.active:
-                callbacks.append(CALLBACKS.get(callback.name)(**callback.params))
+                callbacks.append(
+                    CALLBACKS.get(callback.name)(**callback.params)
+                )

         return callbacks

     def configure_optimizers(
         self,
     ) -> tuple[
-        list[torch.optim.Optimizer], list[torch.optim.lr_scheduler._LRScheduler]
+        list[torch.optim.Optimizer],
+        list[torch.optim.lr_scheduler._LRScheduler],
     ]:
         """Configures model optimizers and schedulers."""
         cfg_optimizer = self.cfg.trainer.optimizer
@@ -739,18 +800,20 @@ def configure_optimizers(

         return [optimizer], [scheduler]

-    def load_checkpoint(self, path: str | None) -> None:
+    def load_checkpoint(self, path: str | Path | None) -> None:
         """Loads checkpoint weights from provided path.

-        Loads the checkpoints gracefully, ignoring keys that are not found in the model
-        state dict or in the checkpoint.
+        Loads the checkpoint gracefully, ignoring keys that are not
+        found in the model state dict or in the checkpoint.

         @type path: str | Path | None
-        @param path: Path to the checkpoint. If C{None}, no checkpoint will be loaded.
+        @param path: Path to the checkpoint. If C{None}, no checkpoint
+            will be loaded.
         """
         if path is None:
             return

+        path = str(path)
         checkpoint = torch.load(path, map_location=self.device)

         if "state_dict" not in checkpoint:
@@ -809,7 +872,9 @@ def _init_attached_module(
         return module_name, node_name

     @staticmethod
-    def _to_module_dict(modules: dict[str, dict[str, nn.Module]]) -> nn.ModuleDict:
+    def _to_module_dict(
+        modules: dict[str, dict[str, nn.Module]],
+    ) -> nn.ModuleDict:
         return nn.ModuleDict(
             {
                 node_name: nn.ModuleDict(node_modules)
@@ -819,7 +884,9 @@ def _to_module_dict(modules: dict[str, dict[str, nn.Module]]) -> nn.ModuleDict:

     @property
     def _progress_bar(self) -> BaseLuxonisProgressBar:
-        return cast(BaseLuxonisProgressBar, self._trainer.progress_bar_callback)
+        return cast(
+            BaseLuxonisProgressBar, self._trainer.progress_bar_callback
+        )

     @rank_zero_only
     def _print_results(
@@ -829,16 +896,20 @@ def _print_results(

         logger.info(f"{stage} loss: {loss:.4f}")

-        self._progress_bar.print_results(stage=stage, loss=loss, metrics=metrics)
+        self._progress_bar.print_results(
+            stage=stage, loss=loss, metrics=metrics
+        )

         if self.main_metric is not None:
             main_metric_node, main_metric_name = self.main_metric.split("/")
             main_metric = metrics[main_metric_node][main_metric_name]
-            logger.info(f"{stage} main metric ({self.main_metric}): {main_metric:.4f}")
+            logger.info(
+                f"{stage} main metric ({self.main_metric}): {main_metric:.4f}"
+            )

     def _is_train_eval_epoch(self) -> bool:
-        """Checks if train eval should be performed on current epoch based on configured
-        train_metrics_interval."""
+        """Checks if train eval should be performed on current epoch
+        based on configured train_metrics_interval."""
         train_metrics_interval = self.cfg.trainer.train_metrics_interval
         # add +1 to current_epoch because starting epoch is at 0
         return (
diff --git a/luxonis_train/models/luxonis_output.py b/luxonis_train/models/luxonis_output.py
index d69943fc..3cf59329 100644
--- a/luxonis_train/models/luxonis_output.py
+++ b/luxonis_train/models/luxonis_output.py
@@ -3,8 +3,7 @@

 from torch import Tensor

-from luxonis_train.utils.general import to_shape_packet
-from luxonis_train.utils.types import Packet
+from luxonis_train.utils import Packet, to_shape_packet


 @dataclass
diff --git a/luxonis_train/models/predefined_models/base_predefined_model.py
b/luxonis_train/models/predefined_models/base_predefined_model.py index 33ababdc..9388f345 100644 --- a/luxonis_train/models/predefined_models/base_predefined_model.py +++ b/luxonis_train/models/predefined_models/base_predefined_model.py @@ -1,4 +1,4 @@ -from abc import ABC, abstractproperty +from abc import ABC, abstractmethod from luxonis_ml.utils.registry import AutoRegisterMeta @@ -17,21 +17,21 @@ class BasePredefinedModel( registry=MODELS, register=False, ): - @abstractproperty - def nodes(self) -> list[ModelNodeConfig]: - ... + @property + @abstractmethod + def nodes(self) -> list[ModelNodeConfig]: ... - @abstractproperty - def losses(self) -> list[LossModuleConfig]: - ... + @property + @abstractmethod + def losses(self) -> list[LossModuleConfig]: ... - @abstractproperty - def metrics(self) -> list[MetricModuleConfig]: - ... + @property + @abstractmethod + def metrics(self) -> list[MetricModuleConfig]: ... - @abstractproperty - def visualizers(self) -> list[AttachedModuleConfig]: - ... + @property + @abstractmethod + def visualizers(self) -> list[AttachedModuleConfig]: ... def generate_model( self, diff --git a/luxonis_train/models/predefined_models/classification_model.py b/luxonis_train/models/predefined_models/classification_model.py index c9d782eb..e390b667 100644 --- a/luxonis_train/models/predefined_models/classification_model.py +++ b/luxonis_train/models/predefined_models/classification_model.py @@ -1,13 +1,13 @@ from dataclasses import dataclass, field from typing import Literal +from luxonis_train.utils import Kwargs from luxonis_train.utils.config import ( AttachedModuleConfig, LossModuleConfig, MetricModuleConfig, ModelNodeConfig, ) -from luxonis_train.utils.types import Kwargs from .base_predefined_model import BasePredefinedModel @@ -15,7 +15,7 @@ @dataclass class ClassificationModel(BasePredefinedModel): backbone: str = "MicroNet" - task: Literal["multiclass", "multilabel"] = "multilabel" + task: Literal["multiclass", "multilabel"] = "multiclass" backbone_params: Kwargs = field(default_factory=dict) head_params: Kwargs = field(default_factory=dict) loss_params: Kwargs = field(default_factory=dict) diff --git a/luxonis_train/models/predefined_models/detection_model.py b/luxonis_train/models/predefined_models/detection_model.py index e9db4462..94c4487f 100644 --- a/luxonis_train/models/predefined_models/detection_model.py +++ b/luxonis_train/models/predefined_models/detection_model.py @@ -1,12 +1,12 @@ from dataclasses import dataclass, field +from luxonis_train.utils import Kwargs from luxonis_train.utils.config import ( AttachedModuleConfig, LossModuleConfig, MetricModuleConfig, ModelNodeConfig, ) -from luxonis_train.utils.types import Kwargs from .base_predefined_model import BasePredefinedModel @@ -47,7 +47,9 @@ def nodes(self) -> list[ModelNodeConfig]: name="EfficientBBoxHead", alias="detection_head", freezing=self.head_params.pop("freezing", {}), - inputs=["detection_neck"] if self.use_neck else ["detection_backbone"], + inputs=["detection_neck"] + if self.use_neck + else ["detection_backbone"], params=self.head_params, task=self.task_name, ) diff --git a/luxonis_train/models/predefined_models/keypoint_detection_model.py b/luxonis_train/models/predefined_models/keypoint_detection_model.py index 588911c6..670b00b1 100644 --- a/luxonis_train/models/predefined_models/keypoint_detection_model.py +++ b/luxonis_train/models/predefined_models/keypoint_detection_model.py @@ -1,13 +1,13 @@ from dataclasses import dataclass, field from typing import Literal +from 
luxonis_train.utils import Kwargs from luxonis_train.utils.config import ( AttachedModuleConfig, LossModuleConfig, MetricModuleConfig, ModelNodeConfig, ) -from luxonis_train.utils.types import Kwargs from .base_predefined_model import BasePredefinedModel @@ -21,7 +21,7 @@ class KeypointDetectionModel(BasePredefinedModel): loss_params: Kwargs = field(default_factory=dict) head_type: Literal[ "ImplicitKeypointBBoxHead", "EfficientKeypointBBoxHead" - ] = "ImplicitKeypointBBoxHead" + ] = "EfficientKeypointBBoxHead" kpt_visualizer_params: Kwargs = field(default_factory=dict) bbox_visualizer_params: Kwargs = field(default_factory=dict) bbox_task_name: str | None = None @@ -50,7 +50,7 @@ def nodes(self) -> list[ModelNodeConfig]: task = {} if self.bbox_task_name is not None: - task["bbox"] = self.bbox_task_name + task["boundingbox"] = self.bbox_task_name if self.kpt_task_name is not None: task["keypoints"] = self.kpt_task_name diff --git a/luxonis_train/models/predefined_models/segmentation_model.py b/luxonis_train/models/predefined_models/segmentation_model.py index b5e81f76..d1076239 100644 --- a/luxonis_train/models/predefined_models/segmentation_model.py +++ b/luxonis_train/models/predefined_models/segmentation_model.py @@ -1,13 +1,13 @@ from dataclasses import dataclass, field from typing import Literal +from luxonis_train.utils import Kwargs from luxonis_train.utils.config import ( AttachedModuleConfig, LossModuleConfig, MetricModuleConfig, ModelNodeConfig, ) -from luxonis_train.utils.types import Kwargs from .base_predefined_model import BasePredefinedModel diff --git a/luxonis_train/nodes/README.md b/luxonis_train/nodes/README.md index 2f147e23..60e5971c 100644 --- a/luxonis_train/nodes/README.md +++ b/luxonis_train/nodes/README.md @@ -77,7 +77,7 @@ Adapted from [here](https://arxiv.org/pdf/2209.02976.pdf). | Key | Type | Default value | Description | | ------------- | ----------- | --------------------------- | --------------------------------------------------- | | channels_list | List\[int\] | \[64, 128, 256, 512, 1024\] | List of number of channels for each block | -| num_repeats | List\[int\] | \[1, 6, 12, 18, 6\] | List of number of repeats of RepVGGBlock | +| n_repeats | List\[int\] | \[1, 6, 12, 18, 6\] | List of number of repeats of RepVGGBlock | | in_channels | int | 3 | Number of input channels, should be 3 in most cases | | depth_mul | int | 0.33 | Depth multiplier | | width_mul | int | 0.25 | Width multiplier | @@ -145,9 +145,9 @@ Adapted from [here](https://arxiv.org/pdf/2209.02976.pdf). | Key | Type | Default value | Description | | ------------- | ---------------- | ------------------------------------------------------- | ----------------------------------------- | -| num_heads | Literal\[2,3,4\] | 3 ***Note:** Should be same also on head in most cases* | Number of output heads | +| n_heads | Literal\[2,3,4\] | 3 ***Note:** Should be same also on head in most cases* | Number of output heads | | channels_list | List\[int\] | \[256, 128, 128, 256, 256, 512\] | List of number of channels for each block | -| num_repeats | List\[int\] | \[12, 12, 12, 12\] | List of number of repeats of RepVGGBlock | +| n_repeats | List\[int\] | \[12, 12, 12, 12\] | List of number of repeats of RepVGGBlock | | depth_mul | int | 0.33 | Depth multiplier | | width_mul | int | 0.25 | Width multiplier | @@ -182,7 +182,7 @@ Adapted from [here](https://arxiv.org/pdf/2209.02976.pdf). 
| Key | Type | Default value | Description | | ---------- | ----- | ------------- | -------------------------------------------------- | -| num_heads | bool | 3 | Number of output heads | +| n_heads | bool | 3 | Number of output heads | | conf_thres | float | 0.25 | confidence threshold for nms (used for evaluation) | | iou_thres | float | 0.45 | iou threshold for nms (used for evaluation) | @@ -195,7 +195,7 @@ Adapted from [here](https://arxiv.org/pdf/2207.02696.pdf). | Key | Type | Default value | Description | | ---------------- | --------------------------- | ------------- | ---------------------------------------------------------------------------------------------------------- | | n_keypoints | int \| None | None | Number of keypoints. | -| num_heads | int | 3 | Number of output heads | +| n_heads | int | 3 | Number of output heads | | anchors | List\[List\[int\]\] \| None | None | Anchors used for object detection. If set to `None`, the anchors are computed at runtime from the dataset. | | init_coco_biases | bool | True | Whether to use COCO bias and weight initialization | | conf_thres | float | 0.25 | confidence threshold for nms (used for evaluation) | diff --git a/luxonis_train/nodes/activations/__init__.py b/luxonis_train/nodes/activations/__init__.py index 37aea0fc..0d3d1e0b 100644 --- a/luxonis_train/nodes/activations/__init__.py +++ b/luxonis_train/nodes/activations/__init__.py @@ -1,3 +1,3 @@ -from .activations import HSigmoid, HSwish +from .activations import HSigmoid -__all__ = ["HSigmoid", "HSwish"] +__all__ = ["HSigmoid"] diff --git a/luxonis_train/nodes/activations/activations.py b/luxonis_train/nodes/activations/activations.py index f3abedd6..93703a1c 100644 --- a/luxonis_train/nodes/activations/activations.py +++ b/luxonis_train/nodes/activations/activations.py @@ -10,14 +10,3 @@ def __init__(self): def forward(self, x: Tensor) -> Tensor: return self.relu(x + 3) / 6 - - -class HSwish(nn.Module): - def __init__(self): - """H-Swish activation function from U{Searching for MobileNetV3 - }.""" - super().__init__() - self.sigmoid = HSigmoid() - - def forward(self, x: Tensor) -> Tensor: - return x * self.sigmoid(x) diff --git a/luxonis_train/nodes/backbones/contextspatial.py b/luxonis_train/nodes/backbones/contextspatial.py index 2cac4b81..cf98cd4c 100644 --- a/luxonis_train/nodes/backbones/contextspatial.py +++ b/luxonis_train/nodes/backbones/contextspatial.py @@ -1,9 +1,3 @@ -"""Implementation of Context Spatial backbone. - -Source: U{BiseNetV1} -""" - - from torch import Tensor, nn from torch.nn import functional as F @@ -13,21 +7,43 @@ ConvModule, FeatureFusionBlock, ) +from luxonis_train.utils import Kwargs from luxonis_train.utils.registry import NODES class ContextSpatial(BaseNode[Tensor, list[Tensor]]): - def __init__(self, context_backbone: str = "MobileNetV2", **kwargs): - """Context spatial backbone. - TODO: Add more documentation. + def __init__( + self, + context_backbone: str | nn.Module = "MobileNetV2", + backbone_kwargs: Kwargs | None = None, + **kwargs, + ): + """Context Spatial backbone introduced in BiseNetV1. + Source: U{BiseNetV1} + + @see: U{BiseNetv1: Bilateral Segmentation Network for + Real-time Semantic Segmentation + } @type context_backbone: str - @param context_backbone: Backbone used. Defaults to C{MobileNetV2}. + @param context_backbone: Backbone used in the context path. + Can be either a string or a C{torch.nn.Module}. + If a string argument is used, it has to be a name of a module + stored in the L{NODES} registry. 
Defaults to C{MobileNetV2}. + + @type backbone_kwargs: dict + @param backbone_kwargs: Keyword arguments for the backbone. + Only used when the C{context_backbone} argument is a string. """ super().__init__(**kwargs) - self.context_path = ContextPath(NODES.get(context_backbone)(**kwargs)) + if isinstance(context_backbone, str): + backbone_kwargs = backbone_kwargs or {} + backbone_kwargs |= kwargs + context_backbone = NODES.get(context_backbone)(**backbone_kwargs) + + self.context_path = ContextPath(context_backbone) self.spatial_path = SpatialPath(3, 128) self.ffm = FeatureFusionBlock(256, 256) @@ -35,22 +51,41 @@ def forward(self, inputs: Tensor) -> list[Tensor]: spatial_out = self.spatial_path(inputs) context16, _ = self.context_path(inputs) fm_fuse = self.ffm(spatial_out, context16) - outs = [fm_fuse] - return outs + return [fm_fuse] class SpatialPath(nn.Module): def __init__(self, in_channels: int, out_channels: int): super().__init__() intermediate_channels = 64 - self.conv_7x7 = ConvModule(in_channels, intermediate_channels, 7, 2, 3) + self.conv_7x7 = ConvModule( + in_channels, + intermediate_channels, + kernel_size=7, + stride=2, + padding=3, + ) self.conv_3x3_1 = ConvModule( - intermediate_channels, intermediate_channels, 3, 2, 1 + intermediate_channels, + intermediate_channels, + kernel_size=3, + stride=2, + padding=1, ) self.conv_3x3_2 = ConvModule( - intermediate_channels, intermediate_channels, 3, 2, 1 + intermediate_channels, + intermediate_channels, + kernel_size=3, + stride=2, + padding=1, + ) + self.conv_1x1 = ConvModule( + intermediate_channels, + out_channels, + kernel_size=1, + stride=1, + padding=0, ) - self.conv_1x1 = ConvModule(intermediate_channels, out_channels, 1, 1, 0) def forward(self, x: Tensor) -> Tensor: x = self.conv_7x7(x) @@ -60,25 +95,30 @@ def forward(self, x: Tensor) -> Tensor: class ContextPath(nn.Module): - def __init__(self, backbone: BaseNode): + def __init__(self, backbone: nn.Module): super().__init__() self.backbone = backbone - self.up16 = nn.Upsample(scale_factor=2.0, mode="bilinear", align_corners=True) - self.up32 = nn.Upsample(scale_factor=2.0, mode="bilinear", align_corners=True) + self.up16 = nn.Upsample( + scale_factor=2.0, mode="bilinear", align_corners=True + ) + self.up32 = nn.Upsample( + scale_factor=2.0, mode="bilinear", align_corners=True + ) self.refine16 = ConvModule(128, 128, 3, 1, 1) self.refine32 = ConvModule(128, 128, 3, 1, 1) - def forward(self, x: Tensor) -> list[Tensor]: - *_, down16, down32 = self.backbone.forward(x) + def forward(self, x: Tensor) -> tuple[Tensor, Tensor]: + *_, down16, down32 = self.backbone(x) if not hasattr(self, "arm16"): self.arm16 = AttentionRefinmentBlock(down16.shape[1], 128) self.arm32 = AttentionRefinmentBlock(down32.shape[1], 128) self.global_context = nn.Sequential( - nn.AdaptiveAvgPool2d(1), ConvModule(down32.shape[1], 128, 1, 1, 0) + nn.AdaptiveAvgPool2d(1), + ConvModule(down32.shape[1], 128, 1, 1, 0), ) arm_down16 = self.arm16(down16) @@ -86,15 +126,18 @@ def forward(self, x: Tensor) -> list[Tensor]: global_down32 = self.global_context(down32) global_down32 = F.interpolate( - global_down32, size=down32.size()[2:], mode="bilinear", align_corners=True + global_down32, + size=down32.shape[2:], + mode="bilinear", + align_corners=True, ) - arm_down32 = arm_down32 + global_down32 + arm_down32 += global_down32 arm_down32 = self.up32(arm_down32) arm_down32 = self.refine32(arm_down32) - arm_down16 = arm_down16 + arm_down32 + arm_down16 += arm_down32 arm_down16 = self.up16(arm_down16) arm_down16 = 
self.refine16(arm_down16) - return [arm_down16, arm_down32] + return arm_down16, arm_down32 diff --git a/luxonis_train/nodes/backbones/efficientnet.py b/luxonis_train/nodes/backbones/efficientnet.py index e560bc5f..7744236a 100644 --- a/luxonis_train/nodes/backbones/efficientnet.py +++ b/luxonis_train/nodes/backbones/efficientnet.py @@ -1,8 +1,4 @@ -"""Implementation of the EfficientNet backbone. - -Source: U{https://github.com/rwightman/gen-efficientnet-pytorch} -@license: U{Apache 2.0} -""" +from typing import Any import torch from torch import Tensor, nn @@ -13,33 +9,49 @@ class EfficientNet(BaseNode[Tensor, list[Tensor]]): attach_index: int = -1 - def __init__(self, download_weights: bool = False, **kwargs): + def __init__( + self, + download_weights: bool = False, + out_indices: list[int] | None = None, + **kwargs: Any, + ): """EfficientNet backbone. + EfficientNet is a convolutional neural network architecture and scaling method that uniformly scales all dimensions of depth/width/resolution using a compound coefficient. Unlike conventional practice that arbitrary scales these factors, the EfficientNet scaling method uniformly scales network width, depth, and resolution with a set of fixed scaling coefficients. + + Source: U{https://github.com/rwightman/gen-efficientnet-pytorch} + + @license: U{Apache License, Version 2.0 + } + + @see: U{https://paperswithcode.com/method/efficientnet} + @see: U{EfficientNet: Rethinking Model Scaling for + Convolutional Neural Networks + } @type download_weights: bool @param download_weights: If C{True} download weights from imagenet. Defaults to C{False}. + @type out_indices: list[int] | None + @param out_indices: Indices of the output layers. Defaults to [0, 1, 2, 4, 6]. """ super().__init__(**kwargs) - efficientnet_lite0_model = torch.hub.load( + self.backbone: nn.Module = torch.hub.load( # type: ignore "rwightman/gen-efficientnet-pytorch", "efficientnet_lite0", pretrained=download_weights, ) - efficientnet_lite0_model.classifier = nn.Identity() - self.out_indices = [0, 1, 2, 4, 6] - efficientnet_lite0_model.bn2 = nn.Identity() - efficientnet_lite0_model.conv_head = nn.Identity() - self.backbone = efficientnet_lite0_model - - def forward(self, x: Tensor) -> list[Tensor]: - outs = [] - x = self.backbone.conv_stem(x) + self.out_indices = out_indices or [0, 1, 2, 4, 6] + + def forward(self, inputs: Tensor) -> list[Tensor]: + x = self.backbone.conv_stem(inputs) x = self.backbone.bn1(x) x = self.backbone.act1(x) - for i, m in enumerate(self.backbone.blocks): - x = m(x) + + outs: list[Tensor] = [] + + for i, layer in enumerate(self.backbone.blocks): + x = layer(x) if i in self.out_indices: outs.append(x) diff --git a/luxonis_train/nodes/backbones/efficientrep/__init__.py b/luxonis_train/nodes/backbones/efficientrep/__init__.py new file mode 100644 index 00000000..51ff264a --- /dev/null +++ b/luxonis_train/nodes/backbones/efficientrep/__init__.py @@ -0,0 +1,3 @@ +from .efficientrep import EfficientRep + +__all__ = ["EfficientRep"] diff --git a/luxonis_train/nodes/backbones/efficientrep.py b/luxonis_train/nodes/backbones/efficientrep/efficientrep.py similarity index 53% rename from luxonis_train/nodes/backbones/efficientrep.py rename to luxonis_train/nodes/backbones/efficientrep/efficientrep.py index be558620..0143855c 100644 --- a/luxonis_train/nodes/backbones/efficientrep.py +++ b/luxonis_train/nodes/backbones/efficientrep/efficientrep.py @@ -1,11 +1,5 @@ -"""Implementation of the EfficientRep backbone. 
- -Adapted from U{YOLOv6: A Single-Stage Object Detection Framework for Industrial -Applications}. -""" - import logging -from typing import Literal +from typing import Any from torch import Tensor, nn @@ -15,63 +9,68 @@ RepVGGBlock, SpatialPyramidPoolingBlock, ) -from luxonis_train.utils.general import make_divisible +from luxonis_train.utils import make_divisible + +from .variants import VariantLiteral, get_variant logger = logging.getLogger(__name__) class EfficientRep(BaseNode[Tensor, list[Tensor]]): + in_channels: int + def __init__( self, - variant: Literal["s", "n", "m", "l"] = "n", + variant: VariantLiteral = "nano", channels_list: list[int] | None = None, - num_repeats: list[int] | None = None, - depth_mul: float = 0.33, - width_mul: float = 0.25, - **kwargs, + n_repeats: list[int] | None = None, + depth_mul: float | None = None, + width_mul: float | None = None, + **kwargs: Any, ): - """EfficientRep backbone. - - @type variant: Literal["s", "n", "m", "l"] - @param variant: EfficientRep variant. Defaults to "n". + """Implementation of the EfficientRep backbone. + + Adapted from U{YOLOv6: A Single-Stage Object Detection Framework + for Industrial Applications + }. + + @type variant: Literal["n", "nano", "s", "small", "m", "medium", "l", "large"] + @param variant: EfficientRep variant. Defaults to "nano". + The variant determines the depth and width multipliers. + The depth multiplier determines the number of blocks in each stage and the width multiplier determines the number of channels. + The following variants are available: + - "n" or "nano" (default): depth_multiplier=0.33, width_multiplier=0.25 + - "s" or "small": depth_multiplier=0.33, width_multiplier=0.50 + - "m" or "medium": depth_multiplier=0.60, width_multiplier=0.75 + - "l" or "large": depth_multiplier=1.0, width_multiplier=1.0 @type channels_list: list[int] | None @param channels_list: List of number of channels for each block. If unspecified, defaults to [64, 128, 256, 512, 1024]. - @type num_repeats: list[int] | None - @param num_repeats: List of number of repeats of RepVGGBlock. If unspecified, + @type n_repeats: list[int] | None + @param n_repeats: List of number of repeats of RepVGGBlock. If unspecified, defaults to [1, 6, 12, 18, 6]. @type depth_mul: float - @param depth_mul: Depth multiplier. Depending on the variant, defaults to 0.33. + @param depth_mul: Depth multiplier. If provided, overrides the variant value. @type width_mul: float - @param width_mul: Width multiplier. Depending on the variant, defaults to 0.25. - @type kwargs: Any - @param kwargs: Additional arguments to pass to L{BaseNode}. + @param width_mul: Width multiplier. If provided, overrides the variant value. 
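+        Example (illustrative sketch): C{EfficientRep(variant="small")} is
+        equivalent to passing C{depth_mul=0.33} and C{width_mul=0.50}
+        explicitly.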
""" super().__init__(**kwargs) - if variant not in EFFICIENTREP_VARIANTS: - raise ValueError( - f"EfficientRep model variant should be in {list(EFFICIENTREP_VARIANTS.keys())}" - ) - - ( - depth_mul, - width_mul, - ) = EFFICIENTREP_VARIANTS[variant] + var = get_variant(variant) + depth_mul = depth_mul or var.depth_multiplier + width_mul = width_mul or var.width_multiplier channels_list = channels_list or [64, 128, 256, 512, 1024] - num_repeats = num_repeats or [1, 6, 12, 18, 6] - channels_list = [make_divisible(i * width_mul, 8) for i in channels_list] - num_repeats = [ - (max(round(i * depth_mul), 1) if i > 1 else i) for i in num_repeats + n_repeats = n_repeats or [1, 6, 12, 18, 6] + channels_list = [ + make_divisible(i * width_mul, 8) for i in channels_list + ] + n_repeats = [ + (max(round(i * depth_mul), 1) if i > 1 else i) for i in n_repeats ] - - in_channels = self.in_channels - if not isinstance(in_channels, int): - raise ValueError("EfficientRep module expects only one input.") self.repvgg_encoder = RepVGGBlock( - in_channels=in_channels, + in_channels=self.in_channels, out_channels=channels_list[0], kernel_size=3, stride=2, @@ -90,7 +89,7 @@ def __init__( block=RepVGGBlock, in_channels=channels_list[i + 1], out_channels=channels_list[i + 1], - num_blocks=num_repeats[i + 1], + n_blocks=n_repeats[i + 1], ), ) self.blocks.append(curr_block) @@ -107,27 +106,20 @@ def set_export_mode(self, mode: bool = True) -> None: """Reparametrizes instances of L{RepVGGBlock} in the network. @type mode: bool - @param mode: Whether to set the export mode. Defaults to C{True}. + @param mode: Whether to set the export mode. Defaults to + C{True}. """ super().set_export_mode(mode) if self.export: - logger.info("Reparametrizing EfficientRep.") + logger.info("Reparametrizing 'EfficientRep'.") for module in self.modules(): if isinstance(module, RepVGGBlock): module.reparametrize() def forward(self, inputs: Tensor) -> list[Tensor]: - outputs = [] + outputs: list[Tensor] = [] x = self.repvgg_encoder(inputs) for block in self.blocks: x = block(x) outputs.append(x) return outputs - - -EFFICIENTREP_VARIANTS = { - "n": (0.33, 0.25), - "s": (0.33, 0.50), - "m": (0.60, 0.75), - "l": (1.0, 1.0), -} diff --git a/luxonis_train/nodes/backbones/efficientrep/variants.py b/luxonis_train/nodes/backbones/efficientrep/variants.py new file mode 100644 index 00000000..7ced749e --- /dev/null +++ b/luxonis_train/nodes/backbones/efficientrep/variants.py @@ -0,0 +1,44 @@ +from typing import Literal, TypeAlias + +from pydantic import BaseModel + +VariantLiteral: TypeAlias = Literal[ + "n", "nano", "s", "small", "m", "medium", "l", "large" +] + + +class EfficientRepVariant(BaseModel): + depth_multiplier: float + width_multiplier: float + + +def get_variant(variant: VariantLiteral) -> EfficientRepVariant: + variants = { + "n": EfficientRepVariant( + depth_multiplier=0.33, + width_multiplier=0.25, + ), + "s": EfficientRepVariant( + depth_multiplier=0.33, + width_multiplier=0.50, + ), + "m": EfficientRepVariant( + depth_multiplier=0.60, + width_multiplier=0.75, + ), + "l": EfficientRepVariant( + depth_multiplier=1.0, + width_multiplier=1.0, + ), + } + variants["nano"] = variants["n"] + variants["small"] = variants["s"] + variants["medium"] = variants["m"] + variants["large"] = variants["l"] + + if variant not in variants: # pragma: no cover + raise ValueError( + f"EfficientRep variant should be one of " + f"{list(variants.keys())}, got '{variant}'." 
+ ) + return variants[variant] diff --git a/luxonis_train/nodes/backbones/micronet.py b/luxonis_train/nodes/backbones/micronet.py deleted file mode 100644 index 074dce2a..00000000 --- a/luxonis_train/nodes/backbones/micronet.py +++ /dev/null @@ -1,842 +0,0 @@ -from typing import Literal - -import torch -from torch import Tensor, nn - -from luxonis_train.nodes.activations import HSigmoid, HSwish -from luxonis_train.nodes.base_node import BaseNode -from luxonis_train.nodes.blocks import ConvModule - - -class MicroNet(BaseNode[Tensor, list[Tensor]]): - """ - - TODO: DOCS - """ - - def __init__(self, variant: Literal["M1", "M2", "M3"] = "M1", **kwargs): - """MicroNet backbone. - - @type variant: Literal["M1", "M2", "M3"] - @param variant: Model variant to use. Defaults to "M1". - """ - super().__init__(**kwargs) - - if variant not in MICRONET_VARIANTS_SETTINGS: - raise ValueError( - f"MicroNet model variant should be in {list(MICRONET_VARIANTS_SETTINGS.keys())}" - ) - - self.inplanes = 64 - ( - in_channels, - stem_groups, - _, - init_a, - init_b, - out_indices, - channels, - cfgs, - ) = MICRONET_VARIANTS_SETTINGS[variant] - self.out_indices = out_indices - self.channels = channels - - self.features = nn.ModuleList([Stem(3, 2, stem_groups)]) - - for ( - stride, - out_channels, - kernel_size, - c1, - c2, - g1, - g2, - _, - g3, - g4, - y1, - y2, - y3, - r, - ) in cfgs: - self.features.append( - MicroBlock( - in_channels, - out_channels, - kernel_size, - stride, - (c1, c2), - (g1, g2), - (g3, g4), - (y1, y2, y3), - r, - init_a, - init_b, - ) - ) - in_channels = out_channels - - def forward(self, x: Tensor) -> list[Tensor]: - outs = [] - for m in self.features: - x = m(x) - outs.append(x) - return outs - - -class MicroBlock(nn.Module): - def __init__( - self, - in_channels: int, - out_channels: int, - kernel_size: int = 3, - stride: int = 1, - t1: tuple[int, int] = (2, 2), - gs1: tuple[int, int] = (0, 6), - groups_1x1: tuple[int, int] = (1, 1), - dy: tuple[int, int, int] = (2, 0, 1), - r: int = 1, - init_a: tuple[float, float] = (1.0, 1.0), - init_b: tuple[float, float] = (0.0, 0.0), - ): - super().__init__() - - self.identity = stride == 1 and in_channels == out_channels - y1, y2, y3 = dy - g1, g2 = groups_1x1 - reduction = 8 * r - intermediate_channels = in_channels * t1[0] * t1[1] - - if gs1[0] == 0: - self.layers = nn.Sequential( - DepthSpatialSepConv(in_channels, t1, kernel_size, stride), - DYShiftMax( - intermediate_channels, - intermediate_channels, - init_a, - init_b, - True if y2 == 2 else False, - gs1[1], - reduction, - ) - if y2 > 0 - else nn.ReLU6(True), - ChannelShuffle(gs1[1]), - ChannelShuffle(intermediate_channels // 2) - if y2 != 0 - else nn.Sequential(), - ConvModule( - in_channels=intermediate_channels, - out_channels=out_channels, - kernel_size=1, - groups=g1, - activation=nn.Identity(), - ), - DYShiftMax( - out_channels, - out_channels, - (1.0, 0.0), - (0.0, 0.0), - False, - g2, - reduction // 2, - ) - if y3 > 0 - else nn.Sequential(), - ChannelShuffle(g2), - ChannelShuffle(out_channels // 2) - if out_channels % 2 == 0 and y3 != 0 - else nn.Sequential(), - ) - elif g2 == 0: - self.layers = nn.Sequential( - ConvModule( - in_channels=in_channels, - out_channels=intermediate_channels, - kernel_size=1, - groups=gs1[0], - activation=nn.Identity(), - ), - DYShiftMax( - intermediate_channels, - intermediate_channels, - (1.0, 0.0), - (0.0, 0.0), - False, - gs1[1], - reduction, - ) - if y3 > 0 - else nn.Sequential(), - ) - else: - self.layers = nn.Sequential( - ConvModule( - 
in_channels=in_channels, - out_channels=intermediate_channels, - kernel_size=1, - groups=gs1[0], - activation=nn.Identity(), - ), - DYShiftMax( - intermediate_channels, - intermediate_channels, - init_a, - init_b, - True if y1 == 2 else False, - gs1[1], - reduction, - ) - if y1 > 0 - else nn.ReLU6(True), - ChannelShuffle(gs1[1]), - DepthSpatialSepConv(intermediate_channels, (1, 1), kernel_size, stride), - nn.Sequential(), - DYShiftMax( - intermediate_channels, - intermediate_channels, - init_a, - init_b, - True if y2 == 2 else False, - gs1[1], - reduction, - True, - ) - if y2 > 0 - else nn.ReLU6(True), - ChannelShuffle(intermediate_channels // 4) - if y1 != 0 and y2 != 0 - else nn.Sequential() - if y1 == 0 and y2 == 0 - else ChannelShuffle(intermediate_channels // 2), - ConvModule( - in_channels=intermediate_channels, - out_channels=out_channels, - kernel_size=1, - groups=g1, - activation=nn.Identity(), - ), - DYShiftMax( - out_channels, - out_channels, - (1.0, 0.0), - (0.0, 0.0), - False, - g2, - reduction=reduction // 2 - if out_channels < intermediate_channels - else reduction, - ) - if y3 > 0 - else nn.Sequential(), - ChannelShuffle(g2), - ChannelShuffle(out_channels // 2) if y3 != 0 else nn.Sequential(), - ) - - def forward(self, inputs: Tensor) -> Tensor: - out = self.layers(inputs) - if self.identity: - out += inputs - return out - - -class ChannelShuffle(nn.Module): - def __init__(self, groups: int): - super().__init__() - self.groups = groups - - def forward(self, x: Tensor) -> Tensor: - b, c, h, w = x.size() - channels_per_group = c // self.groups - x = x.view(b, self.groups, channels_per_group, h, w) - x = torch.transpose(x, 1, 2).contiguous() - out = x.view(b, -1, h, w) - return out - - -class DYShiftMax(nn.Module): - def __init__( - self, - in_channels: int, - out_channels: int, - init_a: tuple[float, float] = (0.0, 0.0), - init_b: tuple[float, float] = (0.0, 0.0), - act_relu: bool = True, - g: int = 6, - reduction: int = 4, - expansion: bool = False, - ): - super().__init__() - self.exp: Literal[2, 4] = 4 if act_relu else 2 - self.init_a = init_a - self.init_b = init_b - self.out_channels = out_channels - - self.avg_pool = nn.Sequential(nn.Sequential(), nn.AdaptiveAvgPool2d(1)) - - squeeze = self._make_divisible(in_channels // reduction, 4) - - self.fc = nn.Sequential( - nn.Linear(in_channels, squeeze), - nn.ReLU(True), - nn.Linear(squeeze, out_channels * self.exp), - HSigmoid(), - ) - - if g != 1 and expansion: - g = in_channels // g - - gc = in_channels // g - index = Tensor(range(in_channels)).view(1, in_channels, 1, 1) - index = index.view(1, g, gc, 1, 1) - indexgs = torch.split(index, [1, g - 1], dim=1) - indexgs = torch.cat([indexgs[1], indexgs[0]], dim=1) - indexs = torch.split(indexgs, [1, gc - 1], dim=2) - indexs = torch.cat([indexs[1], indexs[0]], dim=2) - self.index = indexs.view(in_channels).long() - - def forward(self, x: Tensor) -> Tensor: - B, C, _, _ = x.shape - x_out = x - - y = self.avg_pool(x).view(B, C) - y = self.fc(y).view(B, -1, 1, 1) - y = (y - 0.5) * 4.0 - - x2 = x_out[:, self.index, :, :] - - if self.exp == 4: - a1, b1, a2, b2 = torch.split(y, self.out_channels, dim=1) - - a1 = a1 + self.init_a[0] - a2 = a2 + self.init_b[1] - b1 = b1 + self.init_b[0] - b2 = b2 + self.init_b[1] - - z1 = x_out * a1 + x2 * b1 - z2 = x_out * a2 + x2 * b2 - - out = torch.max(z1, z2) - - elif self.exp == 2: - a1, b1 = torch.split(y, self.out_channels, dim=1) - a1 = a1 + self.init_a[0] - b1 = b1 + self.init_b[0] - out = x_out * a1 + x2 * b1 - else: - raise 
RuntimeError("Expansion should be 2 or 4.") - - return out - - def _make_divisible(self, v, divisor, min_value=None): - if min_value is None: - min_value = divisor - new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) - # Make sure that round down does not go down by more than 10%. - if new_v < 0.9 * v: - new_v += divisor - return new_v - - -class SwishLinear(nn.Module): - def __init__(self, in_channels: int, out_channels: int): - super().__init__() - self.linear = nn.Sequential( - nn.Linear(in_channels, out_channels), nn.BatchNorm1d(out_channels), HSwish() - ) - - def forward(self, x: Tensor) -> Tensor: - return self.linear(x) - - -class SpatialSepConvSF(nn.Module): - def __init__( - self, in_channels: int, outs: tuple[int, int], kernel_size: int, stride: int - ): - super().__init__() - out_channels1, out_channels2 = outs - self.conv = nn.Sequential( - nn.Conv2d( - in_channels, - out_channels1, - (kernel_size, 1), - (stride, 1), - (kernel_size // 2, 0), - bias=False, - ), - nn.BatchNorm2d(out_channels1), - nn.Conv2d( - out_channels1, - out_channels1 * out_channels2, - (1, kernel_size), - (1, stride), - (0, kernel_size // 2), - groups=out_channels1, - bias=False, - ), - nn.BatchNorm2d(out_channels1 * out_channels2), - ChannelShuffle(out_channels1), - ) - - def forward(self, x: Tensor) -> Tensor: - return self.conv(x) - - -class Stem(nn.Module): - def __init__(self, in_channels: int, stride: int, outs: tuple[int, int] = (4, 4)): - super().__init__() - self.stem = nn.Sequential( - SpatialSepConvSF(in_channels, outs, 3, stride), nn.ReLU6(True) - ) - - def forward(self, x: Tensor) -> Tensor: - return self.stem(x) - - -class DepthSpatialSepConv(nn.Module): - def __init__( - self, in_channels: int, expand: tuple[int, int], kernel_size: int, stride: int - ): - super().__init__() - exp1, exp2 = expand - intermediate_channels = in_channels * exp1 - out_channels = in_channels * exp1 * exp2 - - self.conv = nn.Sequential( - nn.Conv2d( - in_channels, - intermediate_channels, - (kernel_size, 1), - (stride, 1), - (kernel_size // 2, 0), - groups=in_channels, - bias=False, - ), - nn.BatchNorm2d(intermediate_channels), - nn.Conv2d( - intermediate_channels, - out_channels, - (1, kernel_size), - (1, stride), - (0, kernel_size // 2), - groups=intermediate_channels, - bias=False, - ), - nn.BatchNorm2d(out_channels), - ) - - def forward(self, x: Tensor) -> Tensor: - return self.conv(x) - - -MICRONET_VARIANTS_SETTINGS = { - "M1": [ - 6, # stem_ch - [3, 2], # stem_groups - 960, # out_ch - [1.0, 1.0], # init_a - [0.0, 0.0], # init_b - [1, 2, 4, 7], # out indices - [8, 16, 32, 576], - [ - # s, c, ks, c1, c2, g1, g2, c3, g3, g4, y1, y2, y3, r - [2, 8, 3, 2, 2, 0, 6, 8, 2, 2, 2, 0, 1, 1], - [2, 16, 3, 2, 2, 0, 8, 16, 4, 4, 2, 2, 1, 1], - [ - 2, - 16, - 5, - 2, - 2, - 0, - 16, - 16, - 4, - 4, - 2, - 2, - 1, - 1, - ], - [ - 1, - 32, - 5, - 1, - 6, - 4, - 4, - 32, - 4, - 4, - 2, - 2, - 1, - 1, - ], - [ - 2, - 64, - 5, - 1, - 6, - 8, - 8, - 64, - 8, - 8, - 2, - 2, - 1, - 1, - ], - [ - 1, - 96, - 3, - 1, - 6, - 8, - 8, - 96, - 8, - 8, - 2, - 2, - 1, - 2, - ], - [1, 576, 3, 1, 6, 12, 12, 0, 0, 0, 2, 2, 1, 2], # 96->96(4,24)->576 - ], - ], - "M2": [ - 8, - [4, 2], - 1024, - [1.0, 1.0], - [0.0, 0.0], - [1, 3, 6, 9], - [12, 24, 64, 768], - [ - # s, c, ks, c1, c2, g1, g2, c3, g3, g4, y1, y2, y3, r - [ - 2, - 12, - 3, - 2, - 2, - 0, - 8, - 12, - 4, - 4, - 2, - 0, - 1, - 1, - ], - [ - 2, - 16, - 3, - 2, - 2, - 0, - 12, - 16, - 4, - 4, - 2, - 2, - 1, - 1, - ], - [ - 1, - 24, - 3, - 2, - 2, - 0, - 16, - 24, - 4, - 4, 
- 2, - 2, - 1, - 1, - ], - [ - 2, - 32, - 5, - 1, - 6, - 6, - 6, - 32, - 4, - 4, - 2, - 2, - 1, - 1, - ], - [ - 1, - 32, - 5, - 1, - 6, - 8, - 8, - 32, - 4, - 4, - 2, - 2, - 1, - 2, - ], - [ - 1, - 64, - 5, - 1, - 6, - 8, - 8, - 64, - 8, - 8, - 2, - 2, - 1, - 2, - ], - [ - 2, - 96, - 5, - 1, - 6, - 8, - 8, - 96, - 8, - 8, - 2, - 2, - 1, - 2, - ], - [ - 1, - 128, - 3, - 1, - 6, - 12, - 12, - 128, - 8, - 8, - 2, - 2, - 1, - 2, - ], - [1, 768, 3, 1, 6, 16, 16, 0, 0, 0, 2, 2, 1, 2], - ], - ], - "M3": [ - 12, - [4, 3], - 1024, - [1.0, 0.5], - [0.0, 0.5], - [1, 3, 8, 12], - [16, 24, 80, 864], - [ - # s, c, ks, c1, c2, g1, g2, c3, g3, g4, y1, y2, y3, r - [ - 2, - 16, - 3, - 2, - 2, - 0, - 12, - 16, - 4, - 4, - 0, - 2, - 0, - 1, - ], - [ - 2, - 24, - 3, - 2, - 2, - 0, - 16, - 24, - 4, - 4, - 0, - 2, - 0, - 1, - ], - [ - 1, - 24, - 3, - 2, - 2, - 0, - 24, - 24, - 4, - 4, - 0, - 2, - 0, - 1, - ], - [ - 2, - 32, - 5, - 1, - 6, - 6, - 6, - 32, - 4, - 4, - 0, - 2, - 0, - 1, - ], - [ - 1, - 32, - 5, - 1, - 6, - 8, - 8, - 32, - 4, - 4, - 0, - 2, - 0, - 2, - ], - [ - 1, - 64, - 5, - 1, - 6, - 8, - 8, - 48, - 8, - 8, - 0, - 2, - 0, - 2, - ], - [ - 1, - 80, - 5, - 1, - 6, - 8, - 8, - 80, - 8, - 8, - 0, - 2, - 0, - 2, - ], - [ - 1, - 80, - 5, - 1, - 6, - 10, - 10, - 80, - 8, - 8, - 0, - 2, - 0, - 2, - ], - [ - 2, - 120, - 5, - 1, - 6, - 10, - 10, - 120, - 10, - 10, - 0, - 2, - 0, - 2, - ], - [ - 1, - 120, - 5, - 1, - 6, - 12, - 12, - 120, - 10, - 10, - 0, - 2, - 0, - 2, - ], - [ - 1, - 144, - 3, - 1, - 6, - 12, - 12, - 144, - 12, - 12, - 0, - 2, - 0, - 2, - ], - [1, 864, 3, 1, 6, 12, 12, 0, 0, 0, 0, 2, 0, 2], - ], - ], -} diff --git a/luxonis_train/nodes/backbones/micronet/__init__.py b/luxonis_train/nodes/backbones/micronet/__init__.py new file mode 100644 index 00000000..5b41ece3 --- /dev/null +++ b/luxonis_train/nodes/backbones/micronet/__init__.py @@ -0,0 +1,3 @@ +from .micronet import MicroNet + +__all__ = ["MicroNet"] diff --git a/luxonis_train/nodes/backbones/micronet/blocks.py b/luxonis_train/nodes/backbones/micronet/blocks.py new file mode 100644 index 00000000..3da5e15e --- /dev/null +++ b/luxonis_train/nodes/backbones/micronet/blocks.py @@ -0,0 +1,515 @@ +from typing import Literal + +import torch +from torch import Tensor, nn + +from luxonis_train.nodes.activations import HSigmoid +from luxonis_train.nodes.blocks import ConvModule + + +class MicroBlock(nn.Module): + def __init__( + self, + in_channels: int, + out_channels: int, + kernel_size: int = 3, + stride: int = 1, + expansion_ratios: tuple[int, int] = (2, 2), + groups_1: tuple[int, int] = (0, 6), + groups_2: tuple[int, int] = (1, 1), + use_dynamic_shift: tuple[int, int, int] = (2, 0, 1), + reduction_factor: int = 1, + init_a: tuple[float, float] = (1.0, 1.0), + init_b: tuple[float, float] = (0.0, 0.0), + ): + """ + MicroBlock: The basic building block of MicroNet. + + This block implements the Micro-Factorized Convolution and Dynamic Shift-Max activation. + It can be configured to use different combinations of these components based on the network design. + + @type in_channels: int + @param in_channels: Number of input channels. + @type out_channels: int + @param out_channels: Number of output channels. + @type kernel_size: int + @param kernel_size: Size of the convolution kernel. Defaults to 3. + @type stride: int + @param stride: Stride of the convolution. Defaults to 1. + @type expansion_ratios: tuple[int, int] + @param expansion_ratios: Expansion ratios for the intermediate channels. Defaults to (2, 2). 
+ @type groups_1: tuple[int, int] + @param groups_1: Groups for the first set of convolutions. Defaults to (0, 6). + @type groups_2: tuple[int, int] + @param groups_2: Groups for the second set of convolutions. Defaults to (1, 1). + @type use_dynamic_shift: tuple[int, int, int] + @param use_dynamic_shift: Flags to use Dynamic Shift-Max in different positions. Defaults to (2, 0, 1). + @type reduction_factor: int + @param reduction_factor: Reduction factor for the squeeze-and-excitation-like operation. Defaults to 1. + @type init_a: tuple[float, float] + @param init_a: Initialization parameters for Dynamic Shift-Max. Defaults to (1.0, 1.0). + @type init_b: tuple[float, float] + @param init_b: Initialization parameters for Dynamic Shift-Max. Defaults to (0.0, 0.0). + """ + super().__init__() + + self.use_residual = stride == 1 and in_channels == out_channels + self.expansion_ratios = expansion_ratios + use_dy1, use_dy2, use_dy3 = use_dynamic_shift + group1, group2 = groups_2 + reduction = 8 * reduction_factor + intermediate_channels = ( + in_channels * expansion_ratios[0] * expansion_ratios[1] + ) + + if groups_1[0] == 0: + self.layers = self._create_lite_block( + in_channels, + out_channels, + intermediate_channels, + kernel_size, + stride, + groups_1[1], + group1, + group2, + use_dy2, + use_dy3, + reduction, + init_a, + init_b, + ) + elif group2 == 0: + self.layers = self._create_transition_block( + in_channels, + intermediate_channels, + groups_1[0], + groups_1[1], + use_dy3, + reduction, + ) + else: + self.layers = self._create_full_block( + in_channels, + out_channels, + intermediate_channels, + kernel_size, + stride, + groups_1, + group1, + group2, + use_dy1, + use_dy2, + use_dy3, + reduction, + init_a, + init_b, + ) + + def _create_lite_block( + self, + in_channels: int, + out_channels: int, + intermediate_channels: int, + kernel_size: int, + stride: int, + group1: int, + group2: int, + group3: int, + use_dy2: int, + use_dy3: int, + reduction: int, + init_a: tuple[float, float], + init_b: tuple[float, float], + ) -> nn.Sequential: + return nn.Sequential( + DepthSpatialSepConv( + in_channels, self.expansion_ratios, kernel_size, stride + ), + DYShiftMax( + intermediate_channels, + intermediate_channels, + init_a, + init_b, + True if use_dy2 == 2 else False, + group1, + reduction, + ) + if use_dy2 > 0 + else nn.ReLU6(True), + ChannelShuffle(group1), + ChannelShuffle(intermediate_channels // 2) + if use_dy2 != 0 + else nn.Sequential(), + ConvModule( + in_channels=intermediate_channels, + out_channels=out_channels, + kernel_size=1, + groups=group2, + activation=nn.Identity(), + ), + DYShiftMax( + out_channels, + out_channels, + (1.0, 0.0), + (0.0, 0.0), + False, + group3, + reduction // 2, + ) + if use_dy3 > 0 + else nn.Sequential(), + ChannelShuffle(group3), + ChannelShuffle(out_channels // 2) + if out_channels % 2 == 0 and use_dy3 != 0 + else nn.Sequential(), + ) + + def _create_transition_block( + self, + in_channels: int, + intermediate_channels: int, + group1: int, + group2: int, + use_dy3: int, + reduction: int, + ) -> nn.Sequential: + return nn.Sequential( + ConvModule( + in_channels=in_channels, + out_channels=intermediate_channels, + kernel_size=1, + groups=group1, + activation=nn.Identity(), + ), + DYShiftMax( + intermediate_channels, + intermediate_channels, + (1.0, 0.0), + (0.0, 0.0), + False, + group2, + reduction, + ) + if use_dy3 > 0 + else nn.Sequential(), + ) + + def _create_full_block( + self, + in_channels: int, + out_channels: int, + intermediate_channels: int, + 
kernel_size: int, + stride: int, + groups_1: tuple[int, int], + group1: int, + group2: int, + use_dy1: int, + use_dy2: int, + use_dy3: int, + reduction: int, + init_a: tuple[float, float], + init_b: tuple[float, float], + ) -> nn.Sequential: + return nn.Sequential( + ConvModule( + in_channels=in_channels, + out_channels=intermediate_channels, + kernel_size=1, + groups=groups_1[0], + activation=nn.Identity(), + ), + DYShiftMax( + intermediate_channels, + intermediate_channels, + init_a, + init_b, + True if use_dy1 == 2 else False, + groups_1[1], + reduction, + ) + if use_dy1 > 0 + else nn.ReLU6(True), + ChannelShuffle(groups_1[1]), + DepthSpatialSepConv( + intermediate_channels, (1, 1), kernel_size, stride + ), + DYShiftMax( + intermediate_channels, + intermediate_channels, + init_a, + init_b, + True if use_dy2 == 2 else False, + groups_1[1], + reduction, + True, + ) + if use_dy2 > 0 + else nn.ReLU6(True), + ChannelShuffle(intermediate_channels // 4) + if use_dy1 != 0 and use_dy2 != 0 + else nn.Sequential() + if use_dy1 == 0 and use_dy2 == 0 + else ChannelShuffle(intermediate_channels // 2), + ConvModule( + in_channels=intermediate_channels, + out_channels=out_channels, + kernel_size=1, + groups=group1, + activation=nn.Identity(), + ), + DYShiftMax( + out_channels, + out_channels, + (1.0, 0.0), + (0.0, 0.0), + False, + group2, + reduction=reduction // 2 + if out_channels < intermediate_channels + else reduction, + ) + if use_dy3 > 0 + else nn.Sequential(), + ChannelShuffle(group2), + ChannelShuffle(out_channels // 2) + if use_dy3 != 0 + else nn.Sequential(), + ) + + def forward(self, inputs: Tensor) -> Tensor: + out = self.layers(inputs) + if self.use_residual: + out += inputs + return out + + +class ChannelShuffle(nn.Module): + def __init__(self, groups: int): + """Shuffle the channels of the input tensor. + + This operation is used to mix information between groups after + grouped convolutions. + + @type groups: int + @param groups: Number of groups to divide the channels into + before shuffling. + """ + + super().__init__() + self.groups = groups + + def forward(self, x: Tensor) -> Tensor: + batch_size, channels, height, width = x.size() + channels_per_group = channels // self.groups + x = x.view(batch_size, self.groups, channels_per_group, height, width) + x = torch.transpose(x, 1, 2).contiguous() + out = x.view(batch_size, -1, height, width) + return out + + +class DYShiftMax(nn.Module): + def __init__( + self, + in_channels: int, + out_channels: int, + init_a: tuple[float, float] = (0.0, 0.0), + init_b: tuple[float, float] = (0.0, 0.0), + use_relu: bool = True, + groups: int = 6, + reduction: int = 4, + expansion: bool = False, + ): + """Dynamic Shift-Max activation function. + + This module implements the Dynamic Shift-Max operation, which + adaptively fuses and selects channel information based on the + input. + + @type in_channels: int + @param in_channels: Number of input channels. + @type out_channels: int + @param out_channels: Number of output channels. + @type init_a: tuple[float, float] + @param init_a: Initial values for the 'a' parameters. Defaults + to (0.0, 0.0). + @type init_b: tuple[float, float] + @param init_b: Initial values for the 'b' parameters. Defaults + to (0.0, 0.0). + @type use_relu: bool + @param use_relu: Whether to use ReLU activation. Defaults to + True. + @type groups: int + @param groups: Number of groups for channel shuffling. Defaults + to 6. + @type reduction: int + @param reduction: Reduction factor for the squeeze operation. + Defaults to 4. 
+ @type expansion: bool + @param expansion: Whether to use expansion in grouping. Defaults + to False. + """ + super().__init__() + self.exp: Literal[2, 4] = 4 if use_relu else 2 + self.init_a = init_a + self.init_b = init_b + self.out_channels = out_channels + + self.avg_pool = nn.AdaptiveAvgPool2d(1) + + squeeze_channels = self._make_divisible(in_channels // reduction, 4) + + self.fc = nn.Sequential( + nn.Linear(in_channels, squeeze_channels), + nn.ReLU(True), + nn.Linear(squeeze_channels, out_channels * self.exp), + HSigmoid(), + ) + + if groups != 1 and expansion: + groups = in_channels // groups + + channels_per_group = in_channels // groups + index = torch.arange(in_channels).view(1, in_channels, 1, 1) + index = index.view(1, groups, channels_per_group, 1, 1) + index_groups = torch.split(index, [1, groups - 1], dim=1) + index_groups = torch.cat([index_groups[1], index_groups[0]], dim=1) + index_splits = torch.split( + index_groups, [1, channels_per_group - 1], dim=2 + ) + index_splits = torch.cat([index_splits[1], index_splits[0]], dim=2) + self.index = index_splits.view(in_channels).long() + + def forward(self, x: Tensor) -> Tensor: + batch_size, channels, _, _ = x.shape + x_out = x + + y = self.avg_pool(x).view(batch_size, channels) + y = self.fc(y).view(batch_size, -1, 1, 1) + y = (y - 0.5) * 4.0 + + x2 = x_out[:, self.index, :, :] + + if self.exp == 4: + a1, b1, a2, b2 = torch.split(y, self.out_channels, dim=1) + + a1 = a1 + self.init_a[0] + a2 = a2 + self.init_b[1] + b1 = b1 + self.init_b[0] + b2 = b2 + self.init_b[1] + + z1 = x_out * a1 + x2 * b1 + z2 = x_out * a2 + x2 * b2 + + out = torch.max(z1, z2) + + elif self.exp == 2: + a1, b1 = torch.split(y, self.out_channels, dim=1) + a1 = a1 + self.init_a[0] + b1 = b1 + self.init_b[0] + out = x_out * a1 + x2 * b1 + else: + raise RuntimeError("Expansion should be 2 or 4.") + + return out + + def _make_divisible( + self, value: int, divisor: int, min_value: int | None = None + ) -> int: + if min_value is None: + min_value = divisor + new_v = max(min_value, int(value + divisor / 2) // divisor * divisor) + # Make sure that round down does not go down by more than 10%. 
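+        # Worked example (illustrative): value=20, divisor=8 gives
+        # new_v = max(8, int(20 + 4) // 8 * 8) = 24; since 24 >= 0.9 * 20,
+        # 24 is returned. With divisor=16, new_v = 16 < 18.0, so it is
+        # bumped up to 32.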
+ if new_v < 0.9 * value: + new_v += divisor + return new_v + + +class SpatialSepConvSF(nn.Module): + def __init__( + self, + in_channels: int, + outs: tuple[int, int], + kernel_size: int, + stride: int, + ): + super().__init__() + out_channels1, out_channels2 = outs + self.conv = nn.Sequential( + nn.Conv2d( + in_channels, + out_channels1, + kernel_size=(kernel_size, 1), + stride=(stride, 1), + padding=(kernel_size // 2, 0), + bias=False, + ), + nn.BatchNorm2d(out_channels1), + nn.Conv2d( + out_channels1, + out_channels1 * out_channels2, + kernel_size=(1, kernel_size), + stride=(1, stride), + padding=(0, kernel_size // 2), + groups=out_channels1, + bias=False, + ), + nn.BatchNorm2d(out_channels1 * out_channels2), + ChannelShuffle(out_channels1), + ) + + def forward(self, x: Tensor) -> Tensor: + return self.conv(x) + + +class Stem(nn.Module): + def __init__( + self, in_channels: int, stride: int, outs: tuple[int, int] = (4, 4) + ): + super().__init__() + self.stem = nn.Sequential( + SpatialSepConvSF(in_channels, outs, 3, stride), nn.ReLU6(True) + ) + + def forward(self, x: Tensor) -> Tensor: + return self.stem(x) + + +class DepthSpatialSepConv(nn.Module): + def __init__( + self, + in_channels: int, + expand: tuple[int, int], + kernel_size: int, + stride: int, + ): + super().__init__() + exp1, exp2 = expand + intermediate_channels = in_channels * exp1 + out_channels = in_channels * exp1 * exp2 + + self.conv = nn.Sequential( + nn.Conv2d( + in_channels, + intermediate_channels, + (kernel_size, 1), + (stride, 1), + padding=(kernel_size // 2, 0), + groups=in_channels, + bias=False, + ), + nn.BatchNorm2d(intermediate_channels), + nn.Conv2d( + intermediate_channels, + out_channels, + (1, kernel_size), + (1, stride), + padding=(0, kernel_size // 2), + groups=intermediate_channels, + bias=False, + ), + nn.BatchNorm2d(out_channels), + ) + + def forward(self, x: Tensor) -> Tensor: + return self.conv(x) diff --git a/luxonis_train/nodes/backbones/micronet/micronet.py b/luxonis_train/nodes/backbones/micronet/micronet.py new file mode 100644 index 00000000..82df5cb3 --- /dev/null +++ b/luxonis_train/nodes/backbones/micronet/micronet.py @@ -0,0 +1,62 @@ +from typing import Any, Literal + +from torch import Tensor, nn + +from luxonis_train.nodes.base_node import BaseNode + +from .blocks import MicroBlock, Stem +from .variants import get_variant + + +class MicroNet(BaseNode[Tensor, list[Tensor]]): + def __init__( + self, + variant: Literal["M1", "M2", "M3"] = "M1", + out_indices: list[int] | None = None, + **kwargs: Any, + ): + """MicroNet backbone. + + This class creates the full MicroNet architecture based on the + specified variant. It consists of a stem layer followed by + multiple MicroBlocks. + + @type variant: Literal["M1", "M2", "M3"] + @param variant: Model variant to use. Defaults to "M1". + @type out_indices: list[int] | None + @param out_indices: Indices of the output layers. If provided, + overrides the variant value. 
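+        Example (illustrative sketch): C{MicroNet(variant="M2")} returns
+        feature maps at the variant's default indices C{[1, 3, 6, 9]};
+        passing C{out_indices} overrides this selection.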
+ """ + super().__init__(**kwargs) + + var = get_variant(variant) + self.out_indices = out_indices or var.out_indices + in_channels = var.stem_channels + + self.layers = nn.ModuleList([Stem(3, 2, var.stem_groups)]) + + for bc in var.block_configs: + self.layers.append( + MicroBlock( + in_channels, + bc.out_channels, + bc.kernel_size, + bc.stride, + bc.expand_ratio, + bc.groups_1, + bc.groups_2, + bc.dy_shifts, + bc.reduction_factor, + var.init_a, + var.init_b, + ) + ) + in_channels = bc.out_channels + + def forward(self, inputs: Tensor) -> list[Tensor]: + outs: list[Tensor] = [] + for i, layer in enumerate(self.layers): + inputs = layer(inputs) + if i in self.out_indices: + outs.append(inputs) + return outs diff --git a/luxonis_train/nodes/backbones/micronet/variants.py b/luxonis_train/nodes/backbones/micronet/variants.py new file mode 100644 index 00000000..22a8d552 --- /dev/null +++ b/luxonis_train/nodes/backbones/micronet/variants.py @@ -0,0 +1,344 @@ +from typing import Literal + +from pydantic import BaseModel + + +class MicroBlockConfig(BaseModel): + stride: int + out_channels: int + kernel_size: int + expand_ratio: tuple[int, int] + groups_1: tuple[int, int] + groups_2: tuple[int, int] + dy_shifts: tuple[int, int, int] + reduction_factor: int + + +class MicroNetVariant(BaseModel): + stem_channels: int + stem_groups: tuple[int, int] + init_a: tuple[float, float] + init_b: tuple[float, float] + out_indices: list[int] + block_configs: list[MicroBlockConfig] + + +M1 = MicroNetVariant( + stem_channels=6, + stem_groups=(3, 2), + init_a=(1.0, 1.0), + init_b=(0.0, 0.0), + out_indices=[1, 2, 4, 7], + block_configs=[ + MicroBlockConfig( + stride=2, + out_channels=8, + kernel_size=3, + expand_ratio=(2, 2), + groups_1=(0, 6), + groups_2=(2, 2), + dy_shifts=(2, 0, 1), + reduction_factor=1, + ), + MicroBlockConfig( + stride=2, + out_channels=16, + kernel_size=3, + expand_ratio=(2, 2), + groups_1=(0, 8), + groups_2=(4, 4), + dy_shifts=(2, 2, 1), + reduction_factor=1, + ), + MicroBlockConfig( + stride=2, + out_channels=16, + kernel_size=5, + expand_ratio=(2, 2), + groups_1=(0, 16), + groups_2=(4, 4), + dy_shifts=(2, 2, 1), + reduction_factor=1, + ), + MicroBlockConfig( + stride=1, + out_channels=32, + kernel_size=5, + expand_ratio=(1, 6), + groups_1=(4, 4), + groups_2=(4, 4), + dy_shifts=(2, 2, 1), + reduction_factor=1, + ), + MicroBlockConfig( + stride=2, + out_channels=64, + kernel_size=5, + expand_ratio=(1, 6), + groups_1=(8, 8), + groups_2=(8, 8), + dy_shifts=(2, 2, 1), + reduction_factor=1, + ), + MicroBlockConfig( + stride=1, + out_channels=96, + kernel_size=3, + expand_ratio=(1, 6), + groups_1=(8, 8), + groups_2=(8, 8), + dy_shifts=(2, 2, 1), + reduction_factor=2, + ), + MicroBlockConfig( + stride=1, + out_channels=576, + kernel_size=3, + expand_ratio=(1, 6), + groups_1=(12, 12), + groups_2=(0, 0), + dy_shifts=(2, 2, 1), + reduction_factor=2, + ), + ], +) + +M2 = MicroNetVariant( + stem_channels=8, + stem_groups=(4, 2), + init_a=(1.0, 1.0), + init_b=(0.0, 0.0), + out_indices=[1, 3, 6, 9], + block_configs=[ + MicroBlockConfig( + stride=2, + out_channels=12, + kernel_size=3, + expand_ratio=(2, 2), + groups_1=(0, 8), + groups_2=(4, 4), + dy_shifts=(2, 0, 1), + reduction_factor=1, + ), + MicroBlockConfig( + stride=2, + out_channels=16, + kernel_size=3, + expand_ratio=(2, 2), + groups_1=(0, 12), + groups_2=(4, 4), + dy_shifts=(2, 2, 1), + reduction_factor=1, + ), + MicroBlockConfig( + stride=1, + out_channels=24, + kernel_size=3, + expand_ratio=(2, 2), + groups_1=(0, 16), + groups_2=(4, 4), + 
dy_shifts=(2, 2, 1), + reduction_factor=1, + ), + MicroBlockConfig( + stride=2, + out_channels=32, + kernel_size=5, + expand_ratio=(1, 6), + groups_1=(6, 6), + groups_2=(4, 4), + dy_shifts=(2, 2, 1), + reduction_factor=1, + ), + MicroBlockConfig( + stride=1, + out_channels=32, + kernel_size=5, + expand_ratio=(1, 6), + groups_1=(8, 8), + groups_2=(4, 4), + dy_shifts=(2, 2, 1), + reduction_factor=2, + ), + MicroBlockConfig( + stride=1, + out_channels=64, + kernel_size=5, + expand_ratio=(1, 6), + groups_1=(8, 8), + groups_2=(8, 8), + dy_shifts=(2, 2, 1), + reduction_factor=2, + ), + MicroBlockConfig( + stride=2, + out_channels=96, + kernel_size=5, + expand_ratio=(1, 6), + groups_1=(8, 8), + groups_2=(8, 8), + dy_shifts=(2, 2, 1), + reduction_factor=2, + ), + MicroBlockConfig( + stride=1, + out_channels=128, + kernel_size=3, + expand_ratio=(1, 6), + groups_1=(12, 12), + groups_2=(8, 8), + dy_shifts=(2, 2, 1), + reduction_factor=2, + ), + MicroBlockConfig( + stride=1, + out_channels=768, + kernel_size=3, + expand_ratio=(1, 6), + groups_1=(16, 16), + groups_2=(0, 0), + dy_shifts=(2, 2, 1), + reduction_factor=2, + ), + ], +) + +M3 = MicroNetVariant( + stem_channels=12, + stem_groups=(4, 3), + init_a=(1.0, 0.5), + init_b=(0.0, 0.5), + out_indices=[1, 3, 8, 12], + block_configs=[ + MicroBlockConfig( + stride=2, + out_channels=16, + kernel_size=3, + expand_ratio=(2, 2), + groups_1=(0, 12), + groups_2=(4, 4), + dy_shifts=(0, 2, 0), + reduction_factor=1, + ), + MicroBlockConfig( + stride=2, + out_channels=24, + kernel_size=3, + expand_ratio=(2, 2), + groups_1=(0, 16), + groups_2=(4, 4), + dy_shifts=(0, 2, 0), + reduction_factor=1, + ), + MicroBlockConfig( + stride=1, + out_channels=24, + kernel_size=3, + expand_ratio=(2, 2), + groups_1=(0, 24), + groups_2=(4, 4), + dy_shifts=(0, 2, 0), + reduction_factor=1, + ), + MicroBlockConfig( + stride=2, + out_channels=32, + kernel_size=5, + expand_ratio=(1, 6), + groups_1=(6, 6), + groups_2=(4, 4), + dy_shifts=(0, 2, 0), + reduction_factor=1, + ), + MicroBlockConfig( + stride=1, + out_channels=32, + kernel_size=5, + expand_ratio=(1, 6), + groups_1=(8, 8), + groups_2=(4, 4), + dy_shifts=(0, 2, 0), + reduction_factor=2, + ), + MicroBlockConfig( + stride=1, + out_channels=64, + kernel_size=5, + expand_ratio=(1, 6), + groups_1=(8, 8), + groups_2=(8, 8), + dy_shifts=(0, 2, 0), + reduction_factor=2, + ), + MicroBlockConfig( + stride=2, + out_channels=80, + kernel_size=5, + expand_ratio=(1, 6), + groups_1=(8, 8), + groups_2=(8, 8), + dy_shifts=(0, 2, 0), + reduction_factor=2, + ), + MicroBlockConfig( + stride=1, + out_channels=80, + kernel_size=5, + expand_ratio=(1, 6), + groups_1=(10, 10), + groups_2=(8, 8), + dy_shifts=(0, 2, 0), + reduction_factor=2, + ), + MicroBlockConfig( + stride=1, + out_channels=120, + kernel_size=5, + expand_ratio=(1, 6), + groups_1=(10, 10), + groups_2=(10, 10), + dy_shifts=(0, 2, 0), + reduction_factor=2, + ), + MicroBlockConfig( + stride=1, + out_channels=120, + kernel_size=5, + expand_ratio=(1, 6), + groups_1=(12, 12), + groups_2=(10, 10), + dy_shifts=(0, 2, 0), + reduction_factor=2, + ), + MicroBlockConfig( + stride=1, + out_channels=144, + kernel_size=3, + expand_ratio=(1, 6), + groups_1=(12, 12), + groups_2=(12, 12), + dy_shifts=(0, 2, 0), + reduction_factor=2, + ), + MicroBlockConfig( + stride=1, + out_channels=864, + kernel_size=3, + expand_ratio=(1, 6), + groups_1=(12, 12), + groups_2=(0, 0), + dy_shifts=(0, 2, 0), + reduction_factor=2, + ), + ], +) + + +def get_variant(variant: Literal["M1", "M2", "M3"]) -> MicroNetVariant: + 
variants = {"M1": M1, "M2": M2, "M3": M3} + if variant not in variants: # pragma: no cover + raise ValueError( + "MicroNet model variant should be in " + f"{list(variants.keys())}, got {variant}." + ) + return variants[variant] diff --git a/luxonis_train/nodes/backbones/mobilenetv2.py b/luxonis_train/nodes/backbones/mobilenetv2.py index 48161835..8de19854 100644 --- a/luxonis_train/nodes/backbones/mobilenetv2.py +++ b/luxonis_train/nodes/backbones/mobilenetv2.py @@ -1,44 +1,51 @@ -"""MobileNetV2 backbone. - -TODO: source? -""" +from typing import Any import torchvision -from torch import Tensor, nn +from torch import Tensor from luxonis_train.nodes.base_node import BaseNode class MobileNetV2(BaseNode[Tensor, list[Tensor]]): - """Implementation of the MobileNetV2 backbone. - - TODO: add more info - """ - - def __init__(self, download_weights: bool = False, **kwargs): - """Constructor of the MobileNetV2 backbone. + def __init__( + self, + download_weights: bool = False, + out_indices: list[int] | None = None, + **kwargs: Any, + ): + """MobileNetV2 backbone. + + This class implements the MobileNetV2 model as described in: + U{MobileNetV2: Inverted Residuals and Linear Bottlenecks } by Sandler I{et al.} + + The network consists of an initial fully convolutional layer, followed by + 19 bottleneck residual blocks, and a final 1x1 convolution. It can be used + as a feature extractor for tasks like image classification, object detection, + and semantic segmentation. + + Key features: + - Inverted residual structure with linear bottlenecks + - Depth-wise separable convolutions for efficiency + - Configurable width multiplier and input resolution @type download_weights: bool @param download_weights: If True download weights from imagenet. Defaults to False. - @type kwargs: Any - @param kwargs: Additional arguments to pass to L{BaseNode}. + @type out_indices: list[int] | None + @param out_indices: Indices of the output layers. Defaults to [3, 6, 13, 18]. 
""" super().__init__(**kwargs) - mobilenet_v2 = torchvision.models.mobilenet_v2( + self.backbone = torchvision.models.mobilenet_v2( weights="DEFAULT" if download_weights else None ) - mobilenet_v2.classifier = nn.Identity() - self.out_indices = [3, 6, 13, 18] - self.channels = [24, 32, 96, 1280] - self.backbone = mobilenet_v2 - - def forward(self, x: Tensor) -> list[Tensor]: - outs = [] - for i, module in enumerate(self.backbone.features): - x = module(x) + self.out_indices = out_indices or [3, 6, 13, 18] + + def forward(self, inputs: Tensor) -> list[Tensor]: + outs: list[Tensor] = [] + for i, layer in enumerate(self.backbone.features): + inputs = layer(inputs) if i in self.out_indices: - outs.append(x) + outs.append(inputs) return outs diff --git a/luxonis_train/nodes/backbones/mobileone/__init__.py b/luxonis_train/nodes/backbones/mobileone/__init__.py new file mode 100644 index 00000000..a6e573aa --- /dev/null +++ b/luxonis_train/nodes/backbones/mobileone/__init__.py @@ -0,0 +1,3 @@ +from .mobileone import MobileOne + +__all__ = ["MobileOne"] diff --git a/luxonis_train/nodes/backbones/mobileone.py b/luxonis_train/nodes/backbones/mobileone/blocks.py similarity index 55% rename from luxonis_train/nodes/backbones/mobileone.py rename to luxonis_train/nodes/backbones/mobileone/blocks.py index 2d460fd0..63e19eae 100644 --- a/luxonis_train/nodes/backbones/mobileone.py +++ b/luxonis_train/nodes/backbones/mobileone/blocks.py @@ -4,170 +4,12 @@ @license: U{Apple} """ - -from typing import Literal - import torch from torch import Tensor, nn -from luxonis_train.nodes.base_node import BaseNode from luxonis_train.nodes.blocks import ConvModule, SqueezeExciteBlock -class MobileOne(BaseNode[Tensor, list[Tensor]]): - """Implementation of MobileOne backbone. - - TODO: add more details - """ - - in_channels: int - - VARIANTS_SETTINGS: dict[str, dict] = { - "s0": {"width_multipliers": (0.75, 1.0, 1.0, 2.0), "num_conv_branches": 4}, - "s1": {"width_multipliers": (1.5, 1.5, 2.0, 2.5)}, - "s2": {"width_multipliers": (1.5, 2.0, 2.5, 4.0)}, - "s3": {"width_multipliers": (2.0, 2.5, 3.0, 4.0)}, - "s4": {"width_multipliers": (3.0, 3.5, 3.5, 4.0), "use_se": True}, - } - - def __init__(self, variant: Literal["s0", "s1", "s2", "s3", "s4"] = "s0", **kwargs): - """Constructor for the MobileOne module. - - @type variant: Literal["s0", "s1", "s2", "s3", "s4"] - @param variant: Specifies which variant of the MobileOne network to use. For - details, see TODO. Defaults to "s0". 
- """ - super().__init__(**kwargs) - - if variant not in MobileOne.VARIANTS_SETTINGS.keys(): - raise ValueError( - f"MobileOne model variant should be in {list(MobileOne.VARIANTS_SETTINGS.keys())}" - ) - - variant_params = MobileOne.VARIANTS_SETTINGS[variant] - # TODO: make configurable - self.width_multipliers = variant_params["width_multipliers"] - self.num_conv_branches = variant_params.get("num_conv_branches", 1) - self.num_blocks_per_stage = [2, 8, 10, 1] - self.use_se = variant_params.get("use_se", False) - - self.in_planes = min(64, int(64 * self.width_multipliers[0])) - - self.stage0 = MobileOneBlock( - in_channels=self.in_channels, - out_channels=self.in_planes, - kernel_size=3, - stride=2, - padding=1, - ) - self.cur_layer_idx = 1 - self.stage1 = self._make_stage( - int(64 * self.width_multipliers[0]), - self.num_blocks_per_stage[0], - num_se_blocks=0, - ) - self.stage2 = self._make_stage( - int(128 * self.width_multipliers[1]), - self.num_blocks_per_stage[1], - num_se_blocks=0, - ) - self.stage3 = self._make_stage( - int(256 * self.width_multipliers[2]), - self.num_blocks_per_stage[2], - num_se_blocks=int(self.num_blocks_per_stage[2] // 2) if self.use_se else 0, - ) - self.stage4 = self._make_stage( - int(512 * self.width_multipliers[3]), - self.num_blocks_per_stage[3], - num_se_blocks=self.num_blocks_per_stage[3] if self.use_se else 0, - ) - - def forward(self, inputs: Tensor) -> list[Tensor]: - outs = [] - x = self.stage0(inputs) - outs.append(x) - x = self.stage1(x) - outs.append(x) - x = self.stage2(x) - outs.append(x) - x = self.stage3(x) - outs.append(x) - x = self.stage4(x) - outs.append(x) - - return outs - - def export_mode(self, export: bool = True) -> None: - """Sets the module to export mode. - - Reparameterizes the model to obtain a plain CNN-like structure for inference. - TODO: add more details - - @warning: The reparametrization is destructive and cannot be reversed! - - @type export: bool - @param export: Whether to set the export mode to True or False. Defaults to True. - """ - if export: - for module in self.modules(): - if hasattr(module, "reparameterize"): - module.reparameterize() - - def _make_stage(self, planes: int, num_blocks: int, num_se_blocks: int): - """Build a stage of MobileOne model. - - @type planes: int - @param planes: Number of output channels. - @type num_blocks: int - @param num_blocks: Number of blocks in this stage. - @type num_se_blocks: int - @param num_se_blocks: Number of SE blocks in this stage. - @rtype: nn.Sequential - @return: A stage of MobileOne model. - """ - # Get strides for all layers - strides = [2] + [1] * (num_blocks - 1) - blocks = [] - for ix, stride in enumerate(strides): - use_se = False - if num_se_blocks > num_blocks: - raise ValueError( - "Number of SE blocks cannot " "exceed number of layers." - ) - if ix >= (num_blocks - num_se_blocks): - use_se = True - - # Depthwise conv - blocks.append( - MobileOneBlock( - in_channels=self.in_planes, - out_channels=self.in_planes, - kernel_size=3, - stride=stride, - padding=1, - groups=self.in_planes, - use_se=use_se, - num_conv_branches=self.num_conv_branches, - ) - ) - # Pointwise conv - blocks.append( - MobileOneBlock( - in_channels=self.in_planes, - out_channels=planes, - kernel_size=1, - stride=1, - padding=0, - groups=1, - use_se=use_se, - num_conv_branches=self.num_conv_branches, - ) - ) - self.in_planes = planes - self.cur_layer_idx += 1 - return nn.Sequential(*blocks) - - class MobileOneBlock(nn.Module): """MobileOne building block. 
@@ -186,7 +28,7 @@ def __init__( padding: int = 0, groups: int = 1, use_se: bool = False, - num_conv_branches: int = 1, + n_conv_branches: int = 1, ): """Construct a MobileOneBlock module. @@ -205,9 +47,11 @@ def __init__( @type groups: int @param groups: Group number. Defaults to 1. @type use_se: bool - @param use_se: Whether to use SE-ReLU activations. Defaults to False. - @type num_conv_branches: int - @param num_conv_branches: Number of linear conv branches. Defaults to 1. + @param use_se: Whether to use SE-ReLU activations. Defaults to + False. + @type n_conv_branches: int + @param n_conv_branches: Number of linear conv branches. Defaults + to 1. """ super().__init__() @@ -216,17 +60,17 @@ def __init__( self.kernel_size = kernel_size self.in_channels = in_channels self.out_channels = out_channels - self.num_conv_branches = num_conv_branches + self.n_conv_branches = n_conv_branches self.inference_mode = False - # Check if SE-ReLU is requested + self.se: nn.Module if use_se: self.se = SqueezeExciteBlock( in_channels=out_channels, intermediate_channels=int(out_channels * 0.0625), ) else: - self.se = nn.Identity() # type: ignore + self.se = nn.Identity() self.activation = nn.ReLU() # Re-parameterizable skip connection @@ -237,8 +81,8 @@ def __init__( ) # Re-parameterizable conv branches - rbr_conv = list() - for _ in range(self.num_conv_branches): + rbr_conv: list[nn.Module] = [] + for _ in range(self.n_conv_branches): rbr_conv.append( ConvModule( in_channels=self.in_channels, @@ -265,9 +109,9 @@ def __init__( activation=nn.Identity(), ) - def forward(self, inputs: Tensor): + def forward(self, inputs: Tensor) -> Tensor: """Apply forward pass.""" - # Inference mode forward pass. + if self.inference_mode: return self.activation(self.se(self.reparam_conv(inputs))) @@ -284,7 +128,7 @@ def forward(self, inputs: Tensor): # Other branches out = scale_out + identity_out - for ix in range(self.num_conv_branches): + for ix in range(self.n_conv_branches): out += self.rbr_conv[ix](inputs) return self.activation(self.se(out)) @@ -315,10 +159,10 @@ def reparameterize(self): # Delete un-used branches for para in self.parameters(): para.detach_() - self.__delattr__("rbr_conv") - self.__delattr__("rbr_scale") + del self.rbr_conv + del self.rbr_scale if hasattr(self, "rbr_skip"): - self.__delattr__("rbr_skip") + del self.rbr_skip self.inference_mode = True @@ -336,18 +180,22 @@ def _get_kernel_bias(self) -> tuple[Tensor, Tensor]: kernel_scale, bias_scale = self._fuse_bn_tensor(self.rbr_scale) # Pad scale branch kernel to match conv branch kernel size. 
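+            # (e.g. a 1x1 scale-branch kernel is zero-padded to the full
+            # kernel size so that all branch kernels can be summed into a
+            # single convolution during reparameterization)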
            pad = self.kernel_size // 2
-            kernel_scale = torch.nn.functional.pad(kernel_scale, [pad, pad, pad, pad])
+            kernel_scale = torch.nn.functional.pad(
+                kernel_scale, [pad, pad, pad, pad]
+            )
 
         # get weights and bias of skip branch
         kernel_identity = torch.zeros(())
         bias_identity = torch.zeros(())
         if self.rbr_skip is not None:
-            kernel_identity, bias_identity = self._fuse_bn_tensor(self.rbr_skip)
+            kernel_identity, bias_identity = self._fuse_bn_tensor(
+                self.rbr_skip
+            )
 
         # get weights and bias of conv branches
         kernel_conv = torch.zeros(())
         bias_conv = torch.zeros(())
-        for ix in range(self.num_conv_branches):
+        for ix in range(self.n_conv_branches):
             _kernel, _bias = self._fuse_bn_tensor(self.rbr_conv[ix])
             kernel_conv = kernel_conv + _kernel
             bias_conv = bias_conv + _bias
@@ -356,7 +204,7 @@ def _get_kernel_bias(self) -> tuple[Tensor, Tensor]:
         bias_final = bias_conv + bias_scale + bias_identity
         return kernel_final, bias_final
 
-    def _fuse_bn_tensor(self, branch) -> tuple[Tensor, Tensor]:
+    def _fuse_bn_tensor(self, branch: nn.Module) -> tuple[Tensor, Tensor]:
         """Method to fuse batchnorm layer with preceding conv layer.
 
         Reference: U{https://github.com/DingXiaoH/RepVGG/blob/main/repvgg.py#L95}
@@ -374,13 +222,21 @@ def _fuse_bn_tensor(self, branch) -> tuple[Tensor, Tensor]:
         if not hasattr(self, "id_tensor"):
             input_dim = self.in_channels // self.groups
             kernel_value = torch.zeros(
-                (self.in_channels, input_dim, self.kernel_size, self.kernel_size),
+                (
+                    self.in_channels,
+                    input_dim,
+                    self.kernel_size,
+                    self.kernel_size,
+                ),
                 dtype=branch.weight.dtype,
                 device=branch.weight.device,
             )
             for i in range(self.in_channels):
                 kernel_value[
-                    i, i % input_dim, self.kernel_size // 2, self.kernel_size // 2
+                    i,
+                    i % input_dim,
+                    self.kernel_size // 2,
+                    self.kernel_size // 2,
                 ] = 1
             self.id_tensor = kernel_value
         kernel = self.id_tensor
diff --git a/luxonis_train/nodes/backbones/mobileone/mobileone.py b/luxonis_train/nodes/backbones/mobileone/mobileone.py
new file mode 100644
index 00000000..8180f960
--- /dev/null
+++ b/luxonis_train/nodes/backbones/mobileone/mobileone.py
@@ -0,0 +1,197 @@
+"""MobileOne backbone.
+
+Source: U{https://github.com/apple/ml-mobileone}
+@license: U{Apple}
+"""
+
+import logging
+from typing import Any, Literal
+
+from torch import Tensor, nn
+
+from luxonis_train.nodes.base_node import BaseNode
+
+from .blocks import MobileOneBlock
+from .variants import get_variant
+
+logger = logging.getLogger(__name__)
+
+
+class MobileOne(BaseNode[Tensor, list[Tensor]]):
+    in_channels: int
+
+    def __init__(
+        self,
+        variant: Literal["s0", "s1", "s2", "s3", "s4"] = "s0",
+        width_multipliers: tuple[float, float, float, float] | None = None,
+        n_conv_branches: int | None = None,
+        use_se: bool | None = None,
+        **kwargs: Any,
+    ):
+        """MobileOne: An efficient CNN backbone for mobile devices.
+
+        The architecture focuses on reducing memory access costs and improving parallelism
+        while allowing aggressive parameter scaling for better representation capacity.
+        Different variants (S0-S4) offer various accuracy-latency tradeoffs.
+
+        Key features:
+            - Designed for low latency on mobile while maintaining high accuracy
+            - Uses re-parameterizable branches during training that get folded at inference
+            - Employs trivial over-parameterization branches for improved accuracy
+            - Simple feed-forward structure at inference with no branches/skip connections
+            - Variants achieve <1ms inference time on iPhone 12 with up to 75.9% top-1 ImageNet accuracy
+            - Outperforms other efficient architectures like MobileNets on image classification,
+              object detection and semantic segmentation tasks
+            - Uses only basic operators available across platforms (no custom activations)
+
+
+        Reference: U{MobileOne: An Improved One millisecond Mobile Backbone
+            <https://arxiv.org/abs/2206.04040>}
+
+        @type variant: Literal["s0", "s1", "s2", "s3", "s4"]
+        @param variant: Specifies which variant of the MobileOne network to use. Defaults to "s0".
+            Each variant specifies a predefined set of values for:
+                - width multipliers - A tuple of 4 float values specifying the width multipliers for each stage of the network. If the use of SE blocks is disabled, the last two values are ignored.
+                - number of convolution branches - An integer specifying the number of linear convolution branches in MobileOne block.
+                - use of SE blocks - A boolean specifying whether to use SE blocks in the network.
+
+            The variants are as follows:
+                - s0 (default): width_multipliers=(0.75, 1.0, 1.0, 2.0), n_conv_branches=4, use_se=False
+                - s1: width_multipliers=(1.5, 1.5, 2.0, 2.5), n_conv_branches=1, use_se=False
+                - s2: width_multipliers=(1.5, 2.0, 2.5, 4.0), n_conv_branches=1, use_se=False
+                - s3: width_multipliers=(2.0, 2.5, 3.0, 4.0), n_conv_branches=1, use_se=False
+                - s4: width_multipliers=(3.0, 3.5, 3.5, 4.0), n_conv_branches=1, use_se=True
+
+        @type width_multipliers: tuple[float, float, float, float] | None
+        @param width_multipliers: Width multipliers for each stage. If provided, overrides the variant values.
+        @type n_conv_branches: int | None
+        @param n_conv_branches: Number of linear convolution branches in MobileOne block. If provided, overrides the variant values.
+        @type use_se: bool | None
+        @param use_se: Whether to use SE blocks in the network. If provided, overrides the variant value.
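+
+        Example (an illustrative sketch; C{MobileOne} also needs the standard
+        L{BaseNode} arguments, normally supplied by the framework, so a bare
+        call like this only works inside a built model)::
+
+            >>> backbone = MobileOne(variant="s1", n_conv_branches=2)
+            >>> # only `n_conv_branches` is overridden; the width
+            >>> # multipliers still come from the "s1" variant definition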
+ """ + super().__init__(**kwargs) + + var = get_variant(variant) + + width_multipliers = width_multipliers or var.width_multipliers + use_se = use_se or var.use_se + self.n_blocks_per_stage = [2, 8, 10, 1] + self.n_conv_branches = n_conv_branches or var.n_conv_branches + + self.in_planes = min(64, int(64 * width_multipliers[0])) + + self.stage0 = MobileOneBlock( + in_channels=self.in_channels, + out_channels=self.in_planes, + kernel_size=3, + stride=2, + padding=1, + ) + self.cur_layer_idx = 1 + self.stage1 = self._make_stage( + int(64 * width_multipliers[0]), + self.n_blocks_per_stage[0], + n_se_blocks=0, + ) + self.stage2 = self._make_stage( + int(128 * width_multipliers[1]), + self.n_blocks_per_stage[1], + n_se_blocks=0, + ) + self.stage3 = self._make_stage( + int(256 * width_multipliers[2]), + self.n_blocks_per_stage[2], + n_se_blocks=self.n_blocks_per_stage[2] // 2 if use_se else 0, + ) + self.stage4 = self._make_stage( + int(512 * width_multipliers[3]), + self.n_blocks_per_stage[3], + n_se_blocks=self.n_blocks_per_stage[3] if use_se else 0, + ) + + def forward(self, inputs: Tensor) -> list[Tensor]: + outs: list[Tensor] = [] + x = self.stage0(inputs) + outs.append(x) + x = self.stage1(x) + outs.append(x) + x = self.stage2(x) + outs.append(x) + x = self.stage3(x) + outs.append(x) + x = self.stage4(x) + outs.append(x) + + return outs + + def set_export_mode(self, mode: bool = True) -> None: + """Sets the module to export mode. + + Reparameterizes the model to obtain a plain CNN-like structure for inference. + TODO: add more details + + @warning: The reparametrization is destructive and cannot be reversed! + + @type export: bool + @param export: Whether to set the export mode to True or False. Defaults to True. + """ + super().set_export_mode(mode) + if self.export: + logger.info("Reparametrizing 'MobileOne'.") + for module in self.modules(): + if hasattr(module, "reparameterize"): + module.reparameterize() + + def _make_stage(self, planes: int, n_blocks: int, n_se_blocks: int): + """Build a stage of MobileOne model. + + @type planes: int + @param planes: Number of output channels. + @type n_blocks: int + @param n_blocks: Number of blocks in this stage. + @type n_se_blocks: int + @param n_se_blocks: Number of SE blocks in this stage. + @rtype: nn.Sequential + @return: A stage of MobileOne model. + """ + # Get strides for all layers + strides = [2] + [1] * (n_blocks - 1) + blocks: list[nn.Module] = [] + for ix, stride in enumerate(strides): + use_se = False + if n_se_blocks > n_blocks: + raise ValueError( + "Number of SE blocks cannot " "exceed number of layers." 
+ ) + if ix >= (n_blocks - n_se_blocks): + use_se = True + + # Depthwise conv + blocks.append( + MobileOneBlock( + in_channels=self.in_planes, + out_channels=self.in_planes, + kernel_size=3, + stride=stride, + padding=1, + groups=self.in_planes, + use_se=use_se, + n_conv_branches=self.n_conv_branches, + ) + ) + # Pointwise conv + blocks.append( + MobileOneBlock( + in_channels=self.in_planes, + out_channels=planes, + kernel_size=1, + stride=1, + padding=0, + groups=1, + use_se=use_se, + n_conv_branches=self.n_conv_branches, + ) + ) + self.in_planes = planes + self.cur_layer_idx += 1 + return nn.Sequential(*blocks) diff --git a/luxonis_train/nodes/backbones/mobileone/variants.py b/luxonis_train/nodes/backbones/mobileone/variants.py new file mode 100644 index 00000000..fbb0add3 --- /dev/null +++ b/luxonis_train/nodes/backbones/mobileone/variants.py @@ -0,0 +1,39 @@ +from typing import Literal + +from pydantic import BaseModel + + +class MobileOneVariant(BaseModel): + width_multipliers: tuple[float, float, float, float] + n_conv_branches: int = 1 + use_se: bool = False + + +def get_variant( + variant: Literal["s0", "s1", "s2", "s3", "s4"], +) -> MobileOneVariant: + variants = { + "s0": MobileOneVariant( + width_multipliers=(0.75, 1.0, 1.0, 2.0), + n_conv_branches=4, + ), + "s1": MobileOneVariant( + width_multipliers=(1.5, 1.5, 2.0, 2.5), + ), + "s2": MobileOneVariant( + width_multipliers=(1.5, 2.0, 2.5, 4.0), + ), + "s3": MobileOneVariant( + width_multipliers=(2.0, 2.5, 3.0, 4.0), + ), + "s4": MobileOneVariant( + width_multipliers=(3.0, 3.5, 3.5, 4.0), + use_se=True, + ), + } + if variant not in variants: # pragma: no cover + raise ValueError( + "MobileOne model variant should be in " + f"{list(variants.keys())}, got {variant}." + ) + return variants[variant] diff --git a/luxonis_train/nodes/backbones/repvgg.py b/luxonis_train/nodes/backbones/repvgg.py deleted file mode 100644 index c536c78e..00000000 --- a/luxonis_train/nodes/backbones/repvgg.py +++ /dev/null @@ -1,149 +0,0 @@ -import logging -from typing import Literal - -import torch.utils.checkpoint as checkpoint -from torch import Tensor, nn - -from luxonis_train.nodes.blocks import RepVGGBlock - -from ..base_node import BaseNode - -logger = logging.getLogger(__name__) - - -class RepVGG(BaseNode): - """Implementation of RepVGG backbone. - - Source: U{https://github.com/DingXiaoH/RepVGG} - @license: U{MIT}. - - @todo: technical documentation - """ - - in_channels: int - attach_index: int = -1 - - VARIANTS_SETTINGS = { - "A0": { - "num_blocks": [2, 4, 14, 1], - "width_multiplier": [0.75, 0.75, 0.75, 2.5], - }, - "A1": { - "num_blocks": [2, 4, 14, 1], - "width_multiplier": [1, 1, 1, 2.5], - }, - "A2": { - "num_blocks": [2, 4, 14, 1], - "width_multiplier": [1.5, 1.5, 1.5, 2.75], - }, - } - - def __init__( - self, - variant: Literal["A0", "A1", "A2"] = "A0", - num_blocks: list[int] | None = None, - width_multiplier: list[float] | None = None, - override_groups_map: dict[int, int] | None = None, - use_se: bool = False, - use_checkpoint: bool = False, - **kwargs, - ): - """Constructor for the RepVGG module. - - @type variant: Literal["A0", "A1", "A2"] - @param variant: RepVGG model variant. Defaults to "A0". - @type override_groups_map: dict[int, int] | None - @param override_groups_map: Dictionary mapping layer index to number of groups. - @type use_se: bool - @param use_se: Whether to use Squeeze-and-Excitation blocks. - @type use_checkpoint: bool - @param use_checkpoint: Whether to use checkpointing. 
- @type num_blocks: list[int] | None - @param num_blocks: Number of blocks in each stage. - @type width_multiplier: list[float] | None - @param width_multiplier: Width multiplier for each stage. - """ - super().__init__(**kwargs) - if variant not in self.VARIANTS_SETTINGS.keys(): - raise ValueError( - f"RepVGG model variant should be one of " - f"{list(self.VARIANTS_SETTINGS.keys())}." - ) - - num_blocks = num_blocks or self.VARIANTS_SETTINGS[variant]["num_blocks"] - width_multiplier = ( - width_multiplier or self.VARIANTS_SETTINGS[variant]["width_multiplier"] - ) - self.override_groups_map = override_groups_map or {} - assert 0 not in self.override_groups_map - self.use_se = use_se - self.use_checkpoint = use_checkpoint - - self.in_planes = min(64, int(64 * width_multiplier[0])) - self.stage0 = RepVGGBlock( - in_channels=self.in_channels, - out_channels=self.in_planes, - kernel_size=3, - stride=2, - padding=1, - use_se=self.use_se, - ) - self.cur_layer_idx = 1 - self.stage1 = self._make_stage( - int(64 * width_multiplier[0]), num_blocks[0], stride=2 - ) - self.stage2 = self._make_stage( - int(128 * width_multiplier[1]), num_blocks[1], stride=2 - ) - self.stage3 = self._make_stage( - int(256 * width_multiplier[2]), num_blocks[2], stride=2 - ) - self.stage4 = self._make_stage( - int(512 * width_multiplier[3]), num_blocks[3], stride=2 - ) - self.gap = nn.AdaptiveAvgPool2d(output_size=1) - - def forward(self, inputs: Tensor) -> list[Tensor]: - outputs = [] - out = self.stage0(inputs) - for stage in (self.stage1, self.stage2, self.stage3, self.stage4): - for block in stage: - if self.use_checkpoint: - out = checkpoint.checkpoint(block, out) - else: - out = block(out) - outputs.append(out) - return outputs - - def _make_stage(self, planes: int, num_blocks: int, stride: int): - strides = [stride] + [1] * (num_blocks - 1) - blocks = [] - for stride in strides: - cur_groups = self.override_groups_map.get(self.cur_layer_idx, 1) - blocks.append( - RepVGGBlock( - in_channels=self.in_planes, - out_channels=planes, - kernel_size=3, - stride=stride, - padding=1, - groups=cur_groups, - use_se=self.use_se, - ) - ) - self.in_planes = planes - self.cur_layer_idx += 1 - return nn.ModuleList(blocks) - - def set_export_mode(self, mode: bool = True) -> None: - """Reparametrizes instances of L{RepVGGBlock} in the network. - - @type mode: bool - @param mode: Whether to set the export mode. Defaults to C{True}. 
- """ - super().set_export_mode(mode) - if self.export: - logger.info("Reparametrizing RepVGG.") - for module in self.modules(): - if isinstance(module, RepVGGBlock): - module.reparametrize() diff --git a/luxonis_train/nodes/backbones/repvgg/__init__.py b/luxonis_train/nodes/backbones/repvgg/__init__.py new file mode 100644 index 00000000..61a5a4fc --- /dev/null +++ b/luxonis_train/nodes/backbones/repvgg/__init__.py @@ -0,0 +1,3 @@ +from .repvgg import RepVGG + +__all__ = ["RepVGG"] diff --git a/luxonis_train/nodes/backbones/repvgg/repvgg.py b/luxonis_train/nodes/backbones/repvgg/repvgg.py new file mode 100644 index 00000000..fd8a5e67 --- /dev/null +++ b/luxonis_train/nodes/backbones/repvgg/repvgg.py @@ -0,0 +1,135 @@ +import logging +from collections import defaultdict +from typing import Any, Literal + +import torch.utils.checkpoint as checkpoint +from torch import Tensor, nn + +from luxonis_train.nodes.base_node import BaseNode +from luxonis_train.nodes.blocks import RepVGGBlock + +from .variants import get_variant + +logger = logging.getLogger(__name__) + + +class RepVGG(BaseNode[Tensor, list[Tensor]]): + in_channels: int + attach_index: int = -1 + + def __init__( + self, + variant: Literal["A0", "A1", "A2"] = "A0", + n_blocks: tuple[int, int, int, int] | None = None, + width_multiplier: tuple[float, float, float, float] | None = None, + override_groups_map: dict[int, int] | None = None, + use_se: bool = False, + use_checkpoint: bool = False, + **kwargs: Any, + ): + """RepVGG backbone. + + RepVGG is a VGG-style convolutional architecture. + + - Simple feed-forward topology without any branching. + - 3x3 convolutions and ReLU activations. + - No automatic search, manual refinement or compound scaling. + + @license: U{MIT + }. + + @see: U{https://github.com/DingXiaoH/RepVGG} + @see: U{https://paperswithcode.com/method/repvgg} + @see: U{RepVGG: Making VGG-style ConvNets Great Again + } + + + @type variant: Literal["A0", "A1", "A2"] + @param variant: RepVGG model variant. Defaults to "A0". + @type override_groups_map: dict[int, int] | None + @param override_groups_map: Dictionary mapping layer index to number of groups. The layers are indexed starting from 0. + @type use_se: bool + @param use_se: Whether to use Squeeze-and-Excitation blocks. + @type use_checkpoint: bool + @param use_checkpoint: Whether to use checkpointing. + @type n_blocks: tuple[int, int, int, int] | None + @param n_blocks: Number of blocks in each stage. + @type width_multiplier: tuple[float, float, float, float] | None + @param width_multiplier: Width multiplier for each stage. 
+ """ + super().__init__(**kwargs) + var = get_variant(variant) + + n_blocks = n_blocks or var.n_blocks + width_multiplier = width_multiplier or var.width_multiplier + override_groups_map = defaultdict(lambda: 1, override_groups_map or {}) + self.use_se = use_se + self.use_checkpoint = use_checkpoint + + self.in_planes = min(64, int(64 * width_multiplier[0])) + self.stage0 = RepVGGBlock( + in_channels=self.in_channels, + out_channels=self.in_planes, + kernel_size=3, + stride=2, + padding=1, + use_se=self.use_se, + ) + self.blocks = nn.ModuleList( + [ + block + for i in range(4) + for block in self._make_stage( + int(2**i * 64 * width_multiplier[i]), + n_blocks[i], + stride=2, + groups=override_groups_map[i], + ) + ] + ) + self.gap = nn.AdaptiveAvgPool2d(output_size=1) + + def forward(self, inputs: Tensor) -> list[Tensor]: + outputs: list[Tensor] = [] + out = self.stage0(inputs) + for block in self.blocks: + if self.use_checkpoint: + out = checkpoint.checkpoint(block, out) + else: + out = block(out) + outputs.append(out) # type: ignore + return outputs + + def _make_stage( + self, channels: int, n_blocks: int, stride: int, groups: int + ) -> nn.ModuleList: + strides = [stride] + [1] * (n_blocks - 1) + blocks: list[nn.Module] = [] + for stride in strides: + blocks.append( + RepVGGBlock( + in_channels=self.in_planes, + out_channels=channels, + kernel_size=3, + stride=stride, + padding=1, + groups=groups, + use_se=self.use_se, + ) + ) + self.in_planes = channels + return nn.ModuleList(blocks) + + def set_export_mode(self, mode: bool = True) -> None: + """Reparametrizes instances of L{RepVGGBlock} in the network. + + @type mode: bool + @param mode: Whether to set the export mode. Defaults to + C{True}. + """ + super().set_export_mode(mode) + if self.export: + logger.info("Reparametrizing RepVGG.") + for module in self.modules(): + if isinstance(module, RepVGGBlock): + module.reparametrize() diff --git a/luxonis_train/nodes/backbones/repvgg/variants.py b/luxonis_train/nodes/backbones/repvgg/variants.py new file mode 100644 index 00000000..a5c734b5 --- /dev/null +++ b/luxonis_train/nodes/backbones/repvgg/variants.py @@ -0,0 +1,31 @@ +from typing import Literal + +from pydantic import BaseModel + + +class RepVGGVariant(BaseModel): + n_blocks: tuple[int, int, int, int] + width_multiplier: tuple[float, float, float, float] + + +def get_variant(variant: Literal["A0", "A1", "A2"]) -> RepVGGVariant: + variants = { + "A0": RepVGGVariant( + n_blocks=(2, 4, 14, 1), + width_multiplier=(0.75, 0.75, 0.75, 2.5), + ), + "A1": RepVGGVariant( + n_blocks=(2, 4, 14, 1), + width_multiplier=(1, 1, 1, 2.5), + ), + "A2": RepVGGVariant( + n_blocks=(2, 4, 14, 1), + width_multiplier=(1.5, 1.5, 1.5, 2.75), + ), + } + if variant not in variants: # pragma: no cover + raise ValueError( + f"RepVGG variant should be one of " + f"{list(variants.keys())}, got '{variant}'." + ) + return variants[variant] diff --git a/luxonis_train/nodes/backbones/resnet.py b/luxonis_train/nodes/backbones/resnet.py index e4228410..93a13d4a 100644 --- a/luxonis_train/nodes/backbones/resnet.py +++ b/luxonis_train/nodes/backbones/resnet.py @@ -1,55 +1,98 @@ -"""ResNet backbone. 
-
-Source: U{https://pytorch.org/vision/main/models/resnet.html}
-@license: U{PyTorch}
-"""
-from typing import Literal
+from typing import Any, Literal
 
 import torchvision
-from torch import Tensor, nn
+from torch import Tensor
+from torchvision.models import ResNet as TorchResNet
 
-from ..base_node import BaseNode
+from luxonis_train.nodes.base_node import BaseNode
 
 
 class ResNet(BaseNode[Tensor, list[Tensor]]):
     def __init__(
         self,
         variant: Literal["18", "34", "50", "101", "152"] = "18",
-        channels_list: list[int] | None = None,
         download_weights: bool = False,
-        **kwargs,
+        zero_init_residual: bool = False,
+        groups: int = 1,
+        width_per_group: int = 64,
+        replace_stride_with_dilation: tuple[bool, bool, bool] = (
+            False,
+            False,
+            False,
+        ),
+        **kwargs: Any,
     ):
-        """Implementation of the ResNetX backbone.
+        """ResNet backbone.
+
+        Implements the backbone of a ResNet (Residual Network) architecture.
+
+        ResNet is designed to address the vanishing gradient problem in deep neural networks
+        by introducing skip connections. These connections allow the network to learn
+        residual functions with reference to the layer inputs, enabling training of much
+        deeper networks.
+
+        This backbone can be used as a feature extractor for various computer vision tasks
+        such as image classification, object detection, and semantic segmentation. It
+        provides a robust set of features that can be fine-tuned for specific applications.
 
-        TODO: add more info
+        The architecture consists of stacked residual blocks, each containing convolutional
+        layers, batch normalization, and ReLU activations. The skip connections can be
+        either identity mappings or projections, depending on the block type.
 
+        Source: U{https://pytorch.org/vision/main/models/resnet.html}
+
+        @license: U{PyTorch}
+
+        @param variant: ResNet variant, determining the depth and structure of the network. Options are:
+            - "18": 18 layers, uses basic blocks, smaller model suitable for simpler tasks.
+            - "34": 34 layers, uses basic blocks, good balance of depth and computation.
+            - "50": 50 layers, introduces bottleneck blocks, deeper feature extraction.
+            - "101": 101 layers, uses bottleneck blocks, high capacity for complex tasks.
+            - "152": 152 layers, deepest variant, highest capacity but most computationally intensive.
+            The number in each variant represents the total number of weighted layers.
+            Deeper networks generally offer higher accuracy but require more computation.
+            Defaults to "18".
         @type variant: Literal["18", "34", "50", "101", "152"]
-        @param variant: ResNet variant. Defaults to "18".
-        @type channels_list: list[int] | None
-        @param channels_list: List of channels to return.
-            If unset, defaults to [64, 128, 256, 512].
         @type download_weights: bool
-        @param download_weights: If True download weights from imagenet.
+        @param download_weights: If C{True}, downloads weights pretrained on ImageNet.
             Defaults to False.
+        @type zero_init_residual: bool
+        @param zero_init_residual: Zero-initialize the last BN in each residual branch,
+            so that the residual branch starts with zeros, and each residual block behaves like an identity.
+            This improves the model by 0.2~0.3% according to U{Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour
+            <https://arxiv.org/abs/1706.02677>}. Defaults to C{False}.
+
+        @type groups: int
+        @param groups: Number of groups for each block.
+            Defaults to 1. Can be set to a different value only
+            for ResNet-50, ResNet-101, and ResNet-152.
+ The width of the convolutional blocks is computed as + C{int(in_channels * (width_per_group / 64.0)) * groups} + + @type width_per_group: int + @param width_per_group: Number of channels per group. + Defaults to 64. Can be set to a different value only + for ResNet-50, ResNet-101, and ResNet-152. + The width of the convolutional blocks is computed as + C{int(in_channels * (width_per_group / 64.0)) * groups} + + @type replace_stride_with_dilation: tuple[bool, bool, bool] + @param replace_stride_with_dilation: Tuple of booleans where each + indicates if the 2x2 strides should be replaced with a dilated convolution instead. + Defaults to (False, False, False). Can be set to a different value only for ResNet-50, ResNet-101, and ResNet-152. """ super().__init__(**kwargs) - - if variant not in RESNET_VARIANTS: - raise ValueError( - f"ResNet model variant should be in {list(RESNET_VARIANTS.keys())}" - ) - - self.backbone = RESNET_VARIANTS[variant]( - weights="DEFAULT" if download_weights else None + self.backbone = self._get_backbone( + variant, + weights="DEFAULT" if download_weights else None, + zero_init_residual=zero_init_residual, + groups=groups, + width_per_group=width_per_group, + replace_stride_with_dilation=replace_stride_with_dilation, ) - self.backbone.fc = nn.Identity() - - self.channels_list = channels_list or [64, 128, 256, 512] - def forward(self, inputs: Tensor) -> list[Tensor]: - outs = [] + outs: list[Tensor] = [] x = self.backbone.conv1(inputs) x = self.backbone.bn1(x) x = self.backbone.relu(x) @@ -66,11 +109,20 @@ def forward(self, inputs: Tensor) -> list[Tensor]: return outs - -RESNET_VARIANTS = { - "18": torchvision.models.resnet18, - "34": torchvision.models.resnet34, - "50": torchvision.models.resnet50, - "101": torchvision.models.resnet101, - "152": torchvision.models.resnet152, -} + @staticmethod + def _get_backbone( + variant: Literal["18", "34", "50", "101", "152"], **kwargs: Any + ) -> TorchResNet: + variants = { + "18": torchvision.models.resnet18, + "34": torchvision.models.resnet34, + "50": torchvision.models.resnet50, + "101": torchvision.models.resnet101, + "152": torchvision.models.resnet152, + } + if variant not in variants: + raise ValueError( + "ResNet model variant should be in " + f"{list(variants.keys())}, got {variant}." + ) + return variants[variant](**kwargs) diff --git a/luxonis_train/nodes/backbones/rexnetv1.py b/luxonis_train/nodes/backbones/rexnetv1.py index 6d23857e..6567586a 100644 --- a/luxonis_train/nodes/backbones/rexnetv1.py +++ b/luxonis_train/nodes/backbones/rexnetv1.py @@ -1,15 +1,11 @@ -"""Implementation of the ReXNetV1 backbone. - -Source: U{https://github.com/clovaai/rexnet} -@license: U{MIT} -""" +from typing import Any import torch from torch import Tensor, nn from luxonis_train.nodes.base_node import BaseNode from luxonis_train.nodes.blocks import ConvModule -from luxonis_train.utils.general import make_divisible +from luxonis_train.utils import make_divisible class ReXNetV1_lite(BaseNode[Tensor, list[Tensor]]): @@ -21,10 +17,33 @@ def __init__( final_ch: int = 164, multiplier: float = 1.0, kernel_sizes: int | list[int] = 3, - **kwargs, + out_indices: list[int] | None = None, + **kwargs: Any, ): - """ReXNetV1_lite backbone. + """ReXNetV1 (Rank Expansion Networks) backbone, lite version. 
+
+        ReXNet proposes a new approach to designing lightweight CNN architectures by:
+
+            - Studying proper channel dimension expansion at the layer level using rank analysis
+            - Searching for effective channel configurations across the entire network
+            - Parameterizing channel dimensions as a linear function of network depth
+
+        Key aspects:
+
+            - Uses inverted bottleneck blocks similar to MobileNetV2
+            - Employs a linear parameterization of channel dimensions across blocks
+            - Replaces ReLU6 with SiLU (Swish-1) activation in certain layers
+            - Incorporates Squeeze-and-Excitation modules
+
+        ReXNet achieves state-of-the-art performance among lightweight models on ImageNet
+        classification and transfers well to tasks like object detection and fine-grained classification.
+
+        Source: U{https://github.com/clovaai/rexnet}
+
+        @license: U{MIT
+            }
+        @copyright: 2021-present NAVER Corp.
+        @see: U{Rethinking Channel Dimensions for Efficient Model Design
+            <https://arxiv.org/abs/2007.00992>}
         @type fix_head_stem: bool
         @param fix_head_stem: Whether to multiply head stem. Defaults to False.
         @type divisible_value: int
@@ -37,40 +56,44 @@ def __init__(
         @param multiplier: Channel dimension multiplier. Defaults to 1.0.
         @type kernel_sizes: int | list[int]
         @param kernel_sizes: Kernel size for each block. Defaults to 3.
+        @type out_indices: list[int] | None
+        @param out_indices: Indices of the output layers. Defaults to [1, 4, 10, 17].
         """
         super().__init__(**kwargs)
 
-        self.out_indices = [1, 4, 10, 17]
-        self.channels = [16, 48, 112, 184]
         layers = [1, 2, 2, 3, 3, 5]
         strides = [1, 2, 2, 2, 1, 2]
 
+        self.n_convblocks = sum(layers)
+        self.out_indices = out_indices or [1, 4, 10, 17]
+
         kernel_sizes = (
-            [kernel_sizes] * 6 if isinstance(kernel_sizes, int) else kernel_sizes
+            [kernel_sizes] * 6
+            if isinstance(kernel_sizes, int)
+            else kernel_sizes
         )
 
-        strides = sum(
-            [
-                [element] + [1] * (layers[idx] - 1)
-                for idx, element in enumerate(strides)
-            ],
-            [],
-        )
+        strides = [
+            s if i == 0 else 1
+            for layer, s in zip(layers, strides)
+            for i in range(layer)
+        ]
         ts = [1] * layers[0] + [6] * sum(layers[1:])
-        kernel_sizes = sum(
-            [[element] * layers[idx] for idx, element in enumerate(kernel_sizes)], []
-        )
-        self.num_convblocks = sum(layers[:])
+        kernel_sizes = [
+            ks for ks, layer in zip(kernel_sizes, layers) for _ in range(layer)
+        ]
 
         features: list[nn.Module] = []
         inplanes = input_ch / multiplier if multiplier < 1.0 else input_ch
 
-        first_channel = 32 / multiplier if multiplier < 1.0 or fix_head_stem else 32
+        first_channel = (
+            32 / multiplier if multiplier < 1.0 or fix_head_stem else 32
+        )
         first_channel = make_divisible(
             int(round(first_channel * multiplier)), divisible_value
         )
 
-        in_channels_group = []
-        channels_group = []
+        in_channels_group: list[int] = []
+        channels_group: list[int] = []
 
         features.append(
             ConvModule(
@@ -83,7 +106,7 @@ def __init__(
             )
         )
 
-        for i in range(self.num_convblocks):
+        for i in range(self.n_convblocks):
             inplanes_divisible = make_divisible(
                 int(round(inplanes * multiplier)), divisible_value
             )
@@ -92,7 +115,7 @@ def __init__(
                 channels_group.append(inplanes_divisible)
             else:
                 in_channels_group.append(inplanes_divisible)
-                inplanes += final_ch / (self.num_convblocks - 1 * 1.0)
+                inplanes += final_ch / (self.n_convblocks - 1 * 1.0)
                 inplanes_divisible = make_divisible(
                     int(round(inplanes * multiplier)), divisible_value
                 )
@@ -100,7 +123,12 @@ def __init__(
 
         assert channels_group
         for in_c, c, t, k, s in zip(
-            in_channels_group, channels_group, ts, kernel_sizes, strides, strict=True
+            in_channels_group,
+            channels_group,
+            ts,
+            kernel_sizes,
+            strides,
+            strict=True,
         ):
             features.append(
                 LinearBottleneck(
@@ -109,7 +137,9 @@ def __init__(
             )
 
         pen_channels = (
-            int(1280 * multiplier) if multiplier > 1 and not fix_head_stem else 1280
+            int(1280 * multiplier)
+            if multiplier > 1 and not fix_head_stem
+            else 1280
         )
         features.append(
             ConvModule(
@@ -121,12 +151,12 @@ def __init__(
         )
         self.features = nn.Sequential(*features)
 
-    def forward(self, x: Tensor) -> list[Tensor]:
-        outs = []
+    def forward(self, inputs: Tensor) -> list[Tensor]:
+        outs: list[Tensor] = []
         for i, module in enumerate(self.features):
-            x = module(x)
+            inputs = module(inputs)
             if i in self.out_indices:
-                outs.append(x)
+                outs.append(inputs)
         return outs
 
@@ -138,14 +168,12 @@ def __init__(
         t: int,
         kernel_size: int = 3,
         stride: int = 1,
-        **kwargs,
     ):
-        super(LinearBottleneck, self).__init__(**kwargs)
-        self.conv_shortcut = None
+        super().__init__()
         self.use_shortcut = stride == 1 and in_channels <= channels
         self.in_channels = in_channels
         self.out_channels = channels
-        out = []
+        out: list[nn.Module] = []
         if t != 1:
             dw_channels = in_channels * t
             out.append(
diff --git a/luxonis_train/nodes/base_node.py b/luxonis_train/nodes/base_node.py
index 9db45316..aad0b2f2 100644
--- a/luxonis_train/nodes/base_node.py
+++ b/luxonis_train/nodes/base_node.py
@@ -1,25 +1,26 @@
 import inspect
+import logging
 from abc import ABC, abstractmethod
+from contextlib import suppress
 from typing import Generic, TypeVar
 
+from luxonis_ml.data import LabelType
 from luxonis_ml.utils.registry import AutoRegisterMeta
-from pydantic import BaseModel, ValidationError
 from torch import Size, Tensor, nn
+from typeguard import TypeCheckError, check_type
 
-from luxonis_train.utils.general import DatasetMetadata, validate_packet
-from luxonis_train.utils.registry import NODES
-from luxonis_train.utils.types import (
+from luxonis_train.utils import (
     AttachIndexType,
-    FeaturesProtocol,
+    DatasetMetadata,
     IncompatibleException,
-    LabelType,
     Packet,
 )
+from luxonis_train.utils.registry import NODES
 
 ForwardOutputT = TypeVar("ForwardOutputT")
 ForwardInputT = TypeVar("ForwardInputT")
 
-__all__ = ["BaseNode"]
+logger = logging.getLogger(__name__)
 
 
 class BaseNode(
@@ -41,13 +42,10 @@ class BaseNode(
     of lists of tensors. Each key in the dictionary represents a different output
     from the previous node. Input to the node is a list of L{Packet}s, output is
     a single L{Packet}.
 
-    Each node can define a list of L{BaseProtocol}s that the inputs must conform to.
-    L{BaseProtocol} is a pydantic model that defines the structure of the input.
-    When the node is called, the inputs are validated against the protocols and
-    then sent to the L{unwrap} method. The C{unwrap} method should return a valid
-    input to the L{forward} method. Outputs of the C{forward} method are then
-    send to L{weap} method, which wraps the output into a C{Packet}, which is the
-    output of the node.
+    When the node is called, the inputs are sent to the L{unwrap} method.
+    The C{unwrap} method should return a valid input to the L{forward} method.
+    Outputs of the C{forward} method are then sent to the L{wrap} method,
+    which wraps the output into a C{Packet}. The wrapped C{Packet} is the final output of the node.
 
     The L{run} method combines the C{unwrap}, C{forward} and C{wrap} methods
     together with input validation.
 
     When subclassing, the following methods should be implemented:
        - L{forward}: Forward pass of the module.
        - L{unwrap}: Optional. Unwraps the inputs from the input packet.
-        The default implementation expects a single input with `features` key.
+          The default implementation expects a single input with `features` key.
        - L{wrap}: Optional. Wraps the output of the forward pass
-        into a `Packet[Tensor]`. The default implementation expects wraps the output
-        of the forward pass into a packet with either "features" or the task name as the key.
+          into a `Packet[Tensor]`. The default implementation wraps the output
+          of the forward pass into a packet with either "features" or the task name as the key.
 
     Additionally, the following class attributes can be defined:
-        - L{input_protocols}: List of input protocols used to validate inputs to the node.
        - L{attach_index}: Index of previous output that this node attaches to.
        - L{tasks}: Dictionary of tasks that the node supports.
 
@@ -94,32 +91,6 @@ def wrap(output: Tensor) -> Packet[Tensor]:
             # by the attached modules.
             return {"classification": [output]}
 
-    @type input_shapes: list[Packet[Size]] | None
-    @param input_shapes: List of input shapes for the module.
-
-    @type original_in_shape: Size | None
-    @param original_in_shape: Original input shape of the model. Some
-        nodes won't function if not provided.
-
-    @type dataset_metadata: L{DatasetMetadata} | None
-    @param dataset_metadata: Metadata of the dataset.
-        Some nodes won't function if not provided.
-
-    @type n_classes: int | None
-    @param n_classes: Number of classes in the dataset. Provide only
-        in case `dataset_metadata` is not provided. Defaults to None.
-
-    @type in_sizes: Size | list[Size] | None
-    @param in_sizes: List of input sizes for the node.
-        Provide only in case the `input_shapes` were not provided.
-
-    @type _tasks: dict[LabelType, str] | None
-    @param _tasks: Dictionary of tasks that the node supports. Overrides the
-        class L{tasks} attribute. Shouldn't be provided by the user in most cases.
-
-    @type input_protocols: list[type[BaseModel]]
-    @ivar input_protocols: List of input protocols used to validate inputs to the node.
-        Defaults to [L{FeaturesProtocol}].
 
     @type attach_index: AttachIndexType
     @ivar attach_index: Index of previous output that this node attaches to.
@@ -135,7 +106,6 @@ class L{tasks} attribute. Shouldn't be provided by the user in most cases.
         Only needs to be defined for head nodes.
     """
 
-    input_protocols: list[type[BaseModel]] = [FeaturesProtocol]
     attach_index: AttachIndexType
     tasks: list[LabelType] | dict[LabelType, str] | None = None
 
@@ -148,10 +118,50 @@ def __init__(
         n_classes: int | None = None,
         n_keypoints: int | None = None,
         in_sizes: Size | list[Size] | None = None,
+        attach_index: AttachIndexType | None = None,
         _tasks: dict[LabelType, str] | None = None,
     ):
+        """Constructor for the BaseNode.
+
+        @type input_shapes: list[Packet[Size]] | None
+        @param input_shapes: List of input shapes for the module.
+
+        @type original_in_shape: Size | None
+        @param original_in_shape: Original input shape of the model. Some
+            nodes won't function if not provided.
+
+        @type dataset_metadata: L{DatasetMetadata} | None
+        @param dataset_metadata: Metadata of the dataset.
+            Some nodes won't function if not provided.
+
+        @type n_classes: int | None
+        @param n_classes: Number of classes in the dataset. Provide only
+            in case `dataset_metadata` is not provided. Defaults to None.
+
+        @type in_sizes: Size | list[Size] | None
+        @param in_sizes: List of input sizes for the node.
+            Provide only in case the `input_shapes` were not provided.
+ + @type attach_index: AttachIndexType + @param attach_index: Index of previous output that this node attaches to. + Can be a single integer to specify a single output, a tuple of + two or three integers to specify a range of outputs or `"all"` to + specify all outputs. Defaults to "all". Python indexing conventions apply. If provided as a constructor argument, overrides the class attribute. + + + @type _tasks: dict[LabelType, str] | None + @param _tasks: Dictionary of tasks that the node supports. Overrides the + class L{tasks} attribute. Shouldn't be provided by the user in most cases. + """ super().__init__() + if attach_index is not None: + logger.warning( + f"Node {self.name} overrides `attach_index` " + f"by setting it to '{attach_index}'. " + "Make sure this is intended." + ) + self.attach_index = attach_index self._tasks = None if _tasks is not None: self._tasks = _tasks @@ -180,15 +190,36 @@ def __init__( self._epoch = 0 self._in_sizes = in_sizes + self._check_type_overrides() + @staticmethod def _process_tasks( tasks: dict[LabelType, str] | list[LabelType], ) -> dict[LabelType, str]: if isinstance(tasks, dict): return tasks - if isinstance(tasks, list): + else: return {task: task.value for task in tasks} + def _check_type_overrides(self) -> None: + properties = [] + for name, value in inspect.getmembers(self.__class__): + if isinstance(value, property): + properties.append(name) + for name, typ in self.__annotations__.items(): + if name in properties: + with suppress(RuntimeError): + value = getattr(self, name) + try: + check_type(value, typ) + except TypeCheckError as e: + raise IncompatibleException( + f"Node '{self.name}' specifies the type of the property `{name}` as `{typ}`, " + f"but received `{type(value)}`. " + f"This may indicate that the '{self.name}' node is " + "not compatible with its predecessor." + ) from e + def get_task_name(self, task: LabelType) -> str: """Gets the name of a task for a particular C{LabelType}. @@ -196,14 +227,15 @@ def get_task_name(self, task: LabelType) -> str: @param task: Task to get the name for. @rtype: str @return: Name of the task. + @raises RuntimeError: If the node does not define any tasks. @raises ValueError: If the task is not supported by the node. """ if not self._tasks: - raise ValueError(f"Node {self.name} does not have any tasks defined.") + raise RuntimeError(f"Node '{self.name}' does not define any task.") if task not in self._tasks: raise ValueError( - f"Node {self.name} does not support the {task.value} task." + f"Node '{self.name}' does not support the '{task.value}' task." ) return self._tasks[task] @@ -213,14 +245,20 @@ def name(self) -> str: @property def task(self) -> str: - """Getter for the task.""" + """Getter for the task. + + @type: str + @raises RuntimeError: If the node doesn't define any task. + @raises ValueError: If the node defines more than one task. In + that case, use the L{get_task_name} method instead. + """ if not self._tasks: - raise ValueError(f"{self.name} does not have any tasks defined.") + raise RuntimeError(f"{self.name} does not define any task.") if len(self._tasks) > 1: raise ValueError( f"Node {self.name} has multiple tasks defined. " - "Use `get_task_name` method instead." + "Use the `get_task_name` method instead." ) return next(iter(self._tasks.values())) @@ -242,22 +280,27 @@ def get_class_names(self, task: LabelType) -> list[str]: @rtype: list[str] @return: Class names for the task. 
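+
+        Example (illustrative; assumes the dataset defines a
+        segmentation task with classes "car" and "person")::
+
+            >>> node.get_class_names(LabelType.SEGMENTATION)
+            ['car', 'person']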
""" - return self.dataset_metadata.class_names(self.get_task_name(task)) + return self.dataset_metadata.classes(self.get_task_name(task)) @property def n_keypoints(self) -> int: - """Getter for the number of keypoints.""" + """Getter for the number of keypoints. + + @type: int + @raises ValueError: If the node does not support keypoints. + @raises RuntimeError: If the node doesn't define any task. + """ if self._n_keypoints is not None: return self._n_keypoints if self._tasks: if LabelType.KEYPOINTS not in self._tasks: - raise (ValueError(f"{self.name} does not support keypoints.")) + raise ValueError(f"{self.name} does not support keypoints.") return self.dataset_metadata.n_keypoints( self.get_task_name(LabelType.KEYPOINTS) ) - raise ValueError( + raise RuntimeError( f"{self.name} does not have any tasks defined, " "`BaseNode.n_keypoints` property cannot be used. " "Either override the `tasks` class attribute, " @@ -267,12 +310,19 @@ def n_keypoints(self) -> int: @property def n_classes(self) -> int: - """Getter for the number of classes.""" + """Getter for the number of classes. + + @type: int + @raises RuntimeError: If the node doesn't define any task. + @raises ValueError: If the number of classes is different for + different tasks. In that case, use the L{get_n_classes} + method. + """ if self._n_classes is not None: return self._n_classes if not self._tasks: - raise ValueError( + raise RuntimeError( f"{self.name} does not have any tasks defined, " "`BaseNode.n_classes` property cannot be used. " "Either override the `tasks` class attribute, " @@ -296,9 +346,16 @@ def n_classes(self) -> int: @property def class_names(self) -> list[str]: - """Getter for the class names.""" + """Getter for the class names. + + @type: list[str] + @raises RuntimeError: If the node doesn't define any task. + @raises ValueError: If the class names are different for + different tasks. In that case, use the L{get_class_names} + method. + """ if not self._tasks: - raise ValueError( + raise RuntimeError( f"{self.name} does not have any tasks defined, " "`BaseNode.class_names` property cannot be used. " "Either override the `tasks` class attribute, " @@ -306,10 +363,10 @@ def class_names(self) -> list[str]: "the `BaseNode.dataset_metadata.class_names` method manually." ) elif len(self._tasks) == 1: - return self.dataset_metadata.class_names(self.task) + return self.dataset_metadata.classes(self.task) else: class_names = [ - self.dataset_metadata.class_names(self.get_task_name(task)) + self.dataset_metadata.classes(self.get_task_name(task)) for task in self._tasks ] if all(set(names) == set(class_names[0]) for names in class_names): @@ -322,14 +379,25 @@ def class_names(self) -> list[str]: @property def input_shapes(self) -> list[Packet[Size]]: - """Getter for the input shapes.""" + """Getter for the input shapes. + + @type: list[Packet[Size]] + @raises RuntimeError: If the C{input_shapes} were not set during + initialization. + """ + if self._input_shapes is None: raise self._non_set_error("input_shapes") return self._input_shapes @property def original_in_shape(self) -> Size: - """Getter for the original input shape.""" + """Getter for the original input shape as [N, H, W]. + + @type: Size + @raises RuntimeError: If the C{original_in_shape} were not set + during initialization. + """ if self._original_in_shape is None: raise self._non_set_error("original_in_shape") return self._original_in_shape @@ -339,10 +407,11 @@ def dataset_metadata(self) -> DatasetMetadata: """Getter for the dataset metadata. 
@type: L{DatasetMetadata} - @raises ValueError: If the C{dataset_metadata} is C{None}. + @raises RuntimeError: If the C{dataset_metadata} were not set + during initialization. """ if self._dataset_metadata is None: - raise ValueError( + raise RuntimeError( f"{self._non_set_error('dataset_metadata')}" "Either provide `dataset_metadata` or `n_classes`." ) @@ -358,7 +427,7 @@ def in_sizes(self) -> Size | list[Size]: In case `in_sizes` were provided during initialization, they are returned directly. - Example: + Example:: >>> input_shapes = [{"features": [Size(64, 128, 128), Size(3, 224, 224)]}] >>> attach_index = -1 @@ -369,7 +438,7 @@ def in_sizes(self) -> Size | list[Size]: >>> in_sizes = [Size(64, 128, 128), Size(3, 224, 224)] @type: Size | list[Size] - @raises IncompatibleException: If the C{input_shapes} are too complicated for + @raises RuntimeError: If the C{input_shapes} are too complicated for the default implementation. """ if self._in_sizes is not None: @@ -377,27 +446,25 @@ def in_sizes(self) -> Size | list[Size]: features = self.input_shapes[0].get("features") if features is None: - raise IncompatibleException( + raise RuntimeError( f"Feature field is missing in {self.name}. " "The default implementation of `in_sizes` cannot be used." ) - shapes = self.get_attached(self.input_shapes[0]["features"]) - if isinstance(shapes, list) and len(shapes) == 1: - return shapes[0] - return shapes + return self.get_attached(self.input_shapes[0]["features"]) @property def in_channels(self) -> int | list[int]: """Simplified getter for the number of input channels. - Should work out of the box for most cases where the C{input_shapes} are - sufficiently simple. Otherwise the C{input_shapes} should be used directly. If - C{attach_index} is set to "all" or is a slice, returns a list of input channels, + Should work out of the box for most cases where the + C{input_shapes} are sufficiently simple. Otherwise the + C{input_shapes} should be used directly. If C{attach_index} is + set to "all" or is a slice, returns a list of input channels, otherwise returns a single value. @type: int | list[int] - @raises IncompatibleException: If the C{input_shapes} are too complicated for - the default implementation. + @raises RuntimeError: If the C{input_shapes} are too complicated + for the default implementation of C{in_sizes}. """ return self._get_nth_size(-3) @@ -409,8 +476,8 @@ def in_height(self) -> int | list[int]: sufficiently simple. Otherwise the `input_shapes` should be used directly. @type: int | list[int] - @raises IncompatibleException: If the C{input_shapes} are too complicated for - the default implementation. + @raises RuntimeError: If the C{input_shapes} are too complicated for + the default implementation of C{in_sizes}. """ return self._get_nth_size(-2) @@ -422,8 +489,8 @@ def in_width(self) -> int | list[int]: sufficiently simple. Otherwise the `input_shapes` should be used directly. @type: int | list[int] - @raises IncompatibleException: If the C{input_shapes} are too complicated for - the default implementation. + @raises RuntimeError: If the C{input_shapes} are too complicated for + the default implementation of C{in_sizes}. """ return self._get_nth_size(-1) @@ -443,23 +510,26 @@ def set_export_mode(self, mode: bool = True) -> None: def unwrap(self, inputs: list[Packet[Tensor]]) -> ForwardInputT: """Prepares inputs for the forward pass. - Unwraps the inputs from the C{list[Packet[Tensor]]} input so they can be passed - to the forward call. 
The default implementation expects a single input with - C{features} key and returns the tensor or tensors at the C{attach_index} - position. + Unwraps the inputs from the C{list[Packet[Tensor]]} input so + they can be passed to the forward call. The default + implementation expects a single input with C{features} key and + returns the tensor or tensors at the C{attach_index} position. - For most cases the default implementation should be sufficient. Exceptions are - modules with multiple inputs or producing more complex outputs. This is - typically the case for output nodes. + For most cases the default implementation should be sufficient. + Exceptions are modules with multiple inputs or producing more + complex outputs. This is typically the case for output nodes. @type inputs: list[Packet[Tensor]] @param inputs: Inputs to the node. @rtype: ForwardInputT - @return: Prepared inputs, ready to be passed to the L{forward} method. + @return: Prepared inputs, ready to be passed to the L{forward} + method. + @raises ValueError: If the number of inputs is not equal to 1. + In such cases the method has to be overridden. """ if len(inputs) > 1: - raise IncompatibleException( - f"Node {self.name} expects a single input, but got {len(inputs)} inputs instead." + raise ValueError( + f"Node {self.name} expects a single input, but got {len(inputs)} inputs instead. " "If the node expects multiple inputs, the `unwrap` method should be overridden." ) return self.get_attached(inputs[0]["features"]) # type: ignore @@ -468,9 +538,9 @@ def unwrap(self, inputs: list[Packet[Tensor]]) -> ForwardInputT: def forward(self, inputs: ForwardInputT) -> ForwardOutputT: """Forward pass of the module. - @type inputs: ForwardInputT + @type inputs: L{ForwardInputT} @param inputs: Inputs to the module. - @rtype: ForwardOutputT + @rtype: L{ForwardOutputT} @return: Result of the forward pass. """ ... @@ -502,27 +572,30 @@ def wrap(self, output: ForwardOutputT) -> Packet[Tensor]: @rtype: L{Packet}[Tensor] @return: Wrapped output. + + @raises ValueError: If the C{output} argument is not a tensor or a list of tensors. + In such cases the L{wrap} method should be overridden. """ - match output: - case Tensor() as out: - outputs = [out] - case list(tensors) if all(isinstance(t, Tensor) for t in tensors): - outputs = tensors - case _: - raise IncompatibleException( - "Default `wrap` expects a single tensor or a list of tensors." - ) + if isinstance(output, Tensor): + outputs = [output] + elif isinstance(output, (list, tuple)) and all( + isinstance(t, Tensor) for t in output + ): + outputs = list(output) + else: + raise ValueError( + "Default `wrap` expects a single tensor or a list of tensors." + ) try: task = self.task - except ValueError: + except RuntimeError: task = "features" return {task: outputs} def run(self, inputs: list[Packet[Tensor]]) -> Packet[Tensor]: - """Combines the forward pass with the wrapping and unwrapping of the inputs. - - Additionally validates the inputs against `input_protocols`. + """Combines the forward pass with the wrapping and unwrapping of + the inputs. @type inputs: list[Packet[Tensor]] @param inputs: Inputs to the module. @@ -531,9 +604,9 @@ def run(self, inputs: list[Packet[Tensor]]) -> Packet[Tensor]: @return: Outputs of the module as a dictionary of list of tensors: `{"features": [Tensor, ...], "segmentation": [Tensor]}` - @raises IncompatibleException: If the inputs are not compatible with the node. + @raises RuntimeError: If default L{wrap} or L{unwrap} methods are not sufficient. 
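+
+        Illustrative sketch (assumes a node for which the default
+        C{unwrap} and C{wrap} are sufficient; the input shape is made
+        up)::
+
+            >>> node.run([{"features": [torch.rand(1, 64, 56, 56)]}])
+            {'features': [...]}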
""" - unwrapped = self.unwrap(self.validate(inputs)) + unwrapped = self.unwrap(inputs) outputs = self(unwrapped) wrapped = self.wrap(outputs) str_tasks = [task.value for task in self._tasks] if self._tasks else [] @@ -543,38 +616,21 @@ def run(self, inputs: list[Packet[Tensor]]) -> Packet[Tensor]: wrapped[self.get_task_name(LabelType(key))] = value return wrapped - def validate(self, data: list[Packet[Tensor]]) -> list[Packet[Tensor]]: - """Validates the inputs against `input_protocols`.""" - if len(data) != len(self.input_protocols): - raise IncompatibleException( - f"Node {self.name} expects {len(self.input_protocols)} inputs, " - f"but got {len(data)} inputs instead." - ) - try: - return [ - validate_packet(d, protocol) - for d, protocol in zip(data, self.input_protocols) - ] - except ValidationError as e: - raise IncompatibleException.from_validation_error(e, self.name) from e - T = TypeVar("T", Tensor, Size) def get_attached(self, lst: list[T]) -> list[T] | T: """Gets the attached elements from a list. - This method is used to get the attached elements from a list based on - the `attach_index` attribute. + This method is used to get the attached elements from a list + based on the C{attach_index} attribute. @type lst: list[T] - @param lst: List to get the attached elements from. Can be either - a list of tensors or a list of sizes. - + @param lst: List to get the attached elements from. Can be + either a list of tensors or a list of sizes. @rtype: list[T] | T - @return: Attached elements. If `attach_index` is set to `"all"` or is a slice, - returns a list of attached elements. - - @raises ValueError: If the `attach_index` is invalid. + @return: Attached elements. If C{attach_index} is set to + C{"all"} or is a slice, returns a list of attached elements. + @raises ValueError: If the C{attach_index} is invalid. """ def _normalize_index(index: int) -> int: @@ -608,7 +664,9 @@ def _normalize_slice(i: int, j: int) -> slice: case (int(i), int(j), int(k)): return lst[i:j:k] case _: - raise ValueError(f"Invalid attach index: `{self.attach_index}`") + raise ValueError( + f"Invalid attach index: `{self.attach_index}`" + ) def _get_nth_size(self, idx: int) -> int | list[int]: match self.in_sizes: @@ -617,8 +675,8 @@ def _get_nth_size(self, idx: int) -> int | list[int]: case list(sizes): return [size[idx] for size in sizes] - def _non_set_error(self, name: str) -> ValueError: - return ValueError( - f"{self.name} is trying to access `{name}`, " + def _non_set_error(self, name: str) -> RuntimeError: + return RuntimeError( + f"'{self.name}' node is trying to access `{name}`, " "but it was not set during initialization. " ) diff --git a/luxonis_train/nodes/blocks/blocks.py b/luxonis_train/nodes/blocks/blocks.py index 0e0a4ad2..9231ea85 100644 --- a/luxonis_train/nodes/blocks/blocks.py +++ b/luxonis_train/nodes/blocks/blocks.py @@ -1,6 +1,3 @@ -# TODO: cleanup, document -# Check if some blocks could be merged togetner. - import math from typing import TypeVar @@ -13,7 +10,8 @@ class EfficientDecoupledBlock(nn.Module): def __init__(self, n_classes: int, in_channels: int): - """Efficient Decoupled block used for class and regression predictions. + """Efficient Decoupled block used for class and regression + predictions. @type n_classes: int @param n_classes: Number of classes. 
@@ -39,7 +37,9 @@ def __init__(self, n_classes: int, in_channels: int): padding=1, activation=nn.SiLU(), ), - nn.Conv2d(in_channels=in_channels, out_channels=n_classes, kernel_size=1), + nn.Conv2d( + in_channels=in_channels, out_channels=n_classes, kernel_size=1 + ), ) self.regression_branch = nn.Sequential( ConvModule( @@ -152,7 +152,10 @@ def __init__( super().__init__( nn.ConvTranspose2d( - in_channels, out_channels, kernel_size=kernel_size, stride=stride + in_channels, + out_channels, + kernel_size=kernel_size, + stride=stride, ), ConvModule(out_channels, out_channels, kernel_size=3, padding=1), ) @@ -299,7 +302,9 @@ def forward(self, x: Tensor) -> Tensor: else: id_out = self.rbr_identity(x) - return self.nonlinearity(self.se(self.rbr_dense(x) + self.rbr_1x1(x) + id_out)) + return self.nonlinearity( + self.se(self.rbr_dense(x) + self.rbr_1x1(x) + id_out) + ) def reparametrize(self) -> None: if hasattr(self, "rbr_reparam"): @@ -318,15 +323,16 @@ def reparametrize(self) -> None: ) self.rbr_reparam.weight.data = kernel # type: ignore self.rbr_reparam.bias.data = bias # type: ignore - self.__delattr__("rbr_dense") - self.__delattr__("rbr_1x1") + del self.rbr_dense + del self.rbr_1x1 if hasattr(self, "rbr_identity"): - self.__delattr__("rbr_identity") + del self.rbr_identity if hasattr(self, "id_tensor"): - self.__delattr__("id_tensor") + del self.id_tensor def _get_equivalent_kernel_bias(self) -> tuple[Tensor, Tensor]: - """Derives the equivalent kernel and bias in a DIFFERENTIABLE way.""" + """Derives the equivalent kernel and bias in a DIFFERENTIABLE + way.""" kernel3x3, bias3x3 = self._fuse_bn_tensor(self.rbr_dense) kernel1x1, bias1x1 = self._fuse_bn_tensor(self.rbr_1x1) kernelid, biasid = self._fuse_bn_tensor(self.rbr_identity) @@ -343,7 +349,9 @@ def _pad_1x1_to_3x3_tensor(self, kernel1x1: Tensor | None) -> Tensor: else: return torch.nn.functional.pad(kernel1x1, [1, 1, 1, 1]) - def _fuse_bn_tensor(self, branch: nn.Module | None) -> tuple[Tensor, Tensor]: + def _fuse_bn_tensor( + self, branch: nn.Module | None + ) -> tuple[Tensor, Tensor]: if branch is None: return torch.tensor(0), torch.tensor(0) if isinstance(branch, nn.Sequential): @@ -381,11 +389,11 @@ def __init__( block: type[nn.Module], in_channels: int, out_channels: int, - num_blocks: int = 1, + n_blocks: int = 1, ): - """Module which repeats the block n times. First block accepts in_channels and - outputs out_channels while subsequent blocks accept out_channels and output - out_channels. + """Module which repeats the block n times. First block accepts + in_channels and outputs out_channels while subsequent blocks + accept out_channels and output out_channels. @type block: L{nn.Module} @param block: Block to repeat. @@ -393,14 +401,14 @@ def __init__( @param in_channels: Number of input channels. @type out_channels: int @param out_channels: Number of output channels. - @type num_blocks: int - @param num_blocks: Number of blocks to repeat. Defaults to C{1}. + @type n_blocks: int + @param n_blocks: Number of blocks to repeat. Defaults to C{1}. """ super().__init__() in_channels = in_channels self.blocks = nn.ModuleList() - for _ in range(num_blocks): + for _ in range(n_blocks): self.blocks.append( block(in_channels=in_channels, out_channels=out_channels) ) @@ -413,8 +421,11 @@ def forward(self, x: Tensor) -> Tensor: class SpatialPyramidPoolingBlock(nn.Module): - def __init__(self, in_channels: int, out_channels: int, kernel_size: int = 5): - """Spatial Pyramid Pooling block with ReLU activation on three different scales. 
+ def __init__( + self, in_channels: int, out_channels: int, kernel_size: int = 5 + ): + """Spatial Pyramid Pooling block with ReLU activation on three + different scales. @type in_channels: int @param in_channels: Number of input channels. @@ -476,7 +487,9 @@ def forward(self, x: Tensor) -> Tensor: class FeatureFusionBlock(nn.Module): - def __init__(self, in_channels: int, out_channels: int, reduction: int = 1): + def __init__( + self, in_channels: int, out_channels: int, reduction: int = 1 + ): """Feature Fusion block adapted from: U{https://github.com/taveraantonio/BiseNetv1}. @type in_channels: int @@ -600,19 +613,19 @@ def __init__( in_channels: int, in_channels_next: int, out_channels: int, - num_repeats: int, + n_repeats: int, ): """UpBlock used in RepPAN neck. @type in_channels: int @param in_channels: Number of input channels. @type in_channels_next: int - @param in_channels_next: Number of input channels of next input which is used in - concat. + @param in_channels_next: Number of input channels of next input + which is used in concat. @type out_channels: int @param out_channels: Number of output channels. - @type num_repeats: int - @param num_repeats: Number of RepVGGBlock repeats. + @type n_repeats: int + @param n_repeats: Number of RepVGGBlock repeats. """ super().__init__() @@ -634,7 +647,7 @@ def __init__( block=RepVGGBlock, in_channels=in_channels_next + out_channels, out_channels=out_channels, - num_blocks=num_repeats, + n_blocks=n_repeats, ) def forward(self, x0: Tensor, x1: Tensor) -> tuple[Tensor, Tensor]: @@ -652,21 +665,22 @@ def __init__( downsample_out_channels: int, in_channels_next: int, out_channels: int, - num_repeats: int, + n_repeats: int, ): """DownBlock used in RepPAN neck. @type in_channels: int @param in_channels: Number of input channels. @type downsample_out_channels: int - @param downsample_out_channels: Number of output channels after downsample. + @param downsample_out_channels: Number of output channels after + downsample. @type in_channels_next: int - @param in_channels_next: Number of input channels of next input which is used in - concat. + @param in_channels_next: Number of input channels of next input + which is used in concat. @type out_channels: int @param out_channels: Number of output channels. - @type num_repeats: int - @param num_repeats: Number of RepVGGBlock repeats. + @type n_repeats: int + @param n_repeats: Number of RepVGGBlock repeats. """ super().__init__() @@ -681,7 +695,7 @@ def __init__( block=RepVGGBlock, in_channels=downsample_out_channels + in_channels_next, out_channels=out_channels, - num_blocks=num_repeats, + n_blocks=n_repeats, ) def forward(self, x0: Tensor, x1: Tensor) -> Tensor: diff --git a/luxonis_train/nodes/heads/bisenet_head.py b/luxonis_train/nodes/heads/bisenet_head.py index 3fef7584..dd6e6333 100644 --- a/luxonis_train/nodes/heads/bisenet_head.py +++ b/luxonis_train/nodes/heads/bisenet_head.py @@ -1,31 +1,28 @@ -"""BiSeNet segmentation head. - -Adapted from U{https://github.com/taveraantonio/BiseNetv1}. -License: NOT SPECIFIED. 
-""" - +from typing import Any +from luxonis_ml.data import LabelType from torch import Tensor, nn from luxonis_train.nodes.base_node import BaseNode from luxonis_train.nodes.blocks import ConvModule -from luxonis_train.utils.general import infer_upscale_factor -from luxonis_train.utils.types import LabelType, Packet +from luxonis_train.utils import infer_upscale_factor class BiSeNetHead(BaseNode[Tensor, Tensor]): in_height: int + in_width: int in_channels: int tasks: list[LabelType] = [LabelType.SEGMENTATION] - def __init__( - self, - intermediate_channels: int = 64, - **kwargs, - ): + def __init__(self, intermediate_channels: int = 64, **kwargs: Any): """BiSeNet segmentation head. - TODO: Add more documentation. + + Source: U{BiseNetV1} + @license: NOT SPECIFIED. + @see: U{BiseNetv1: Bilateral Segmentation Network for + Real-time Semantic Segmentation + } @type intermediate_channels: int @param intermediate_channels: How many intermediate channels to use. @@ -33,17 +30,28 @@ def __init__( """ super().__init__(**kwargs) - original_height = self.original_in_shape[1] - upscale_factor = 2 ** infer_upscale_factor(self.in_height, original_height) + h, w = self.original_in_shape[1:] + upscale_factor = 2 ** infer_upscale_factor( + (self.in_height, self.in_width), (h, w) + ) out_channels = self.n_classes * upscale_factor * upscale_factor - self.conv_3x3 = ConvModule(self.in_channels, intermediate_channels, 3, 1, 1) - self.conv_1x1 = nn.Conv2d(intermediate_channels, out_channels, 1, 1, 0) + self.conv_3x3 = ConvModule( + self.in_channels, + intermediate_channels, + kernel_size=3, + stride=1, + padding=1, + ) + self.conv_1x1 = nn.Conv2d( + intermediate_channels, + out_channels, + kernel_size=1, + stride=1, + padding=0, + ) self.upscale = nn.PixelShuffle(upscale_factor) - def wrap(self, output: Tensor) -> Packet[Tensor]: - return {"segmentation": [output]} - def forward(self, inputs: Tensor) -> Tensor: x = self.conv_3x3(inputs) x = self.conv_1x1(x) diff --git a/luxonis_train/nodes/heads/classification_head.py b/luxonis_train/nodes/heads/classification_head.py index 07b3d72b..5961c853 100644 --- a/luxonis_train/nodes/heads/classification_head.py +++ b/luxonis_train/nodes/heads/classification_head.py @@ -1,3 +1,5 @@ +from typing import Any + from torch import Tensor, nn from luxonis_train.nodes.base_node import BaseNode @@ -8,16 +10,15 @@ class ClassificationHead(BaseNode[Tensor, Tensor]): in_channels: int tasks: list[LabelType] = [LabelType.CLASSIFICATION] - def __init__( - self, - dropout_rate: float = 0.2, - **kwargs, - ): + def __init__(self, dropout_rate: float = 0.2, **kwargs: Any): """Simple classification head. + Consists of a global average pooling layer followed by a dropout + layer and a single linear layer. + @type dropout_rate: float - @param dropout_rate: Dropout rate before last layer, range C{[0, 1]}. Defaults - to C{0.2}. + @param dropout_rate: Dropout rate before last layer, range C{[0, + 1]}. Defaults to C{0.2}. """ super().__init__(**kwargs) diff --git a/luxonis_train/nodes/heads/efficient_bbox_head.py b/luxonis_train/nodes/heads/efficient_bbox_head.py index 5607a2a8..6f0e01e7 100644 --- a/luxonis_train/nodes/heads/efficient_bbox_head.py +++ b/luxonis_train/nodes/heads/efficient_bbox_head.py @@ -1,22 +1,20 @@ -"""Head for object detection. - -Adapted from U{YOLOv6: A Single-Stage Object Detection Framework for Industrial -Applications}. 
-""" - -from typing import Literal +import logging +from typing import Any, Literal import torch +from luxonis_ml.data import LabelType from torch import Tensor, nn from luxonis_train.nodes.base_node import BaseNode from luxonis_train.nodes.blocks import EfficientDecoupledBlock -from luxonis_train.utils.boxutils import ( +from luxonis_train.utils import ( + Packet, anchors_for_fpn_features, dist2bbox, non_max_suppression, ) -from luxonis_train.utils.types import LabelType, Packet + +logger = logging.getLogger(__name__) class EfficientBBoxHead( @@ -31,24 +29,24 @@ def __init__( conf_thres: float = 0.25, iou_thres: float = 0.45, max_det: int = 300, - **kwargs, + **kwargs: Any, ): """Head for object detection. - TODO: add more documentation - + Adapted from U{YOLOv6: A Single-Stage Object Detection Framework + for Industrial Applications + }. @type n_heads: Literal[2,3,4] - @param n_heads: Number of output heads. Defaults to 3. - ***Note:*** Should be same also on neck in most cases. - + @param n_heads: Number of output heads. Defaults to 3. B{Note:} + Should be same also on neck in most cases. @type conf_thres: float - @param conf_thres: Threshold for confidence. Defaults to C{0.25}. - + @param conf_thres: Threshold for confidence. Defaults to + C{0.25}. @type iou_thres: float @param iou_thres: Threshold for IoU. Defaults to C{0.45}. - @type max_det: int - @param max_det: Maximum number of detections retained after NMS. Defaults to C{300}. + @param max_det: Maximum number of detections retained after NMS. + Defaults to C{300}. """ super().__init__(**kwargs) @@ -58,11 +56,18 @@ def __init__( self.iou_thres = iou_thres self.max_det = max_det - self.stride = self._fit_stride_to_num_heads() + self.stride = self._fit_stride_to_n_heads() self.grid_cell_offset = 0.5 self.grid_cell_size = 5.0 self.heads = nn.ModuleList() + if len(self.in_channels) < self.n_heads: + logger.warning( + f"Head '{self.name}' was set to use {self.n_heads} heads, " + f"but received only {len(self.in_channels)} inputs. " + f"Changing number of heads to {len(self.in_channels)}." 
+ ) + self.n_heads = len(self.in_channels) for i in range(self.n_heads): curr_head = EfficientDecoupledBlock( n_classes=self.n_classes, @@ -92,18 +97,25 @@ def wrap( features, cls_score_list, reg_distri_list = output if self.export: - outputs = [] - for out_cls, out_reg in zip(cls_score_list, reg_distri_list, strict=True): + outputs: list[Tensor] = [] + for out_cls, out_reg in zip( + cls_score_list, reg_distri_list, strict=True + ): conf, _ = out_cls.max(1, keepdim=True) out = torch.cat([out_reg, conf, out_cls], dim=1) outputs.append(out) return {self.task: outputs} cls_tensor = torch.cat( - [cls_score_list[i].flatten(2) for i in range(len(cls_score_list))], dim=2 + [cls_score_list[i].flatten(2) for i in range(len(cls_score_list))], + dim=2, ).permute(0, 2, 1) reg_tensor = torch.cat( - [reg_distri_list[i].flatten(2) for i in range(len(reg_distri_list))], dim=2 + [ + reg_distri_list[i].flatten(2) + for i in range(len(reg_distri_list)) + ], + dim=2, ).permute(0, 2, 1) if self.training: @@ -122,8 +134,9 @@ def wrap( "distributions": [reg_tensor], } - def _fit_stride_to_num_heads(self): - """Returns correct stride for number of heads and attach index.""" + def _fit_stride_to_n_heads(self): + """Returns correct stride for number of heads and attach + index.""" stride = torch.tensor( [ self.original_in_shape[1] / x[2] # type: ignore @@ -136,7 +149,8 @@ def _fit_stride_to_num_heads(self): def _process_to_bbox( self, output: tuple[list[Tensor], Tensor, Tensor] ) -> list[Tensor]: - """Performs post-processing of the output and returns bboxs after NMS.""" + """Performs post-processing of the output and returns bboxs + after NMS.""" features, cls_score_list, reg_dist_list = output _, anchor_points, _, stride_tensor = anchors_for_fpn_features( features, @@ -146,7 +160,9 @@ def _process_to_bbox( multiply_with_stride=False, ) - pred_bboxes = dist2bbox(reg_dist_list, anchor_points, out_format="xyxy") + pred_bboxes = dist2bbox( + reg_dist_list, anchor_points, out_format="xyxy" + ) pred_bboxes *= stride_tensor output_merged = torch.cat( diff --git a/luxonis_train/nodes/heads/efficient_keypoint_bbox_head.py b/luxonis_train/nodes/heads/efficient_keypoint_bbox_head.py index 03d29296..51b8b704 100644 --- a/luxonis_train/nodes/heads/efficient_keypoint_bbox_head.py +++ b/luxonis_train/nodes/heads/efficient_keypoint_bbox_head.py @@ -1,15 +1,16 @@ -from typing import Literal +from typing import Any, Literal import torch +from luxonis_ml.data import LabelType from torch import Tensor, nn from luxonis_train.nodes.blocks import ConvModule -from luxonis_train.utils.boxutils import ( +from luxonis_train.utils import ( + Packet, anchors_for_fpn_features, dist2bbox, non_max_suppression, ) -from luxonis_train.utils.types import LabelType, Packet from .efficient_bbox_head import EfficientBBoxHead @@ -23,7 +24,7 @@ def __init__( conf_thres: float = 0.25, iou_thres: float = 0.45, max_det: int = 300, - **kwargs, + **kwargs: Any, ): """Head for object and keypoint detection. 
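+
+        Illustrative construction (the keyword values shown are the
+        documented defaults; the remaining C{kwargs} required by
+        L{BaseNode} are omitted here)::
+
+            >>> head = EfficientKeypointBBoxHead(
+            ...     n_heads=3, conf_thres=0.25, iou_thres=0.45, max_det=300
+            ... )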
@@ -68,7 +69,12 @@ def forward( ) -> tuple[list[Tensor], list[Tensor], list[Tensor], list[Tensor]]: features, cls_score_list, reg_distri_list = super().forward(inputs) - _, self.anchor_points, _, self.stride_tensor = anchors_for_fpn_features( + ( + _, + self.anchor_points, + _, + self.stride_tensor, + ) = anchors_for_fpn_features( features, self.stride, self.grid_cell_size, @@ -84,17 +90,18 @@ def forward( return features, cls_score_list, reg_distri_list, kpt_list def wrap( - self, output: tuple[list[Tensor], list[Tensor], list[Tensor], list[Tensor]] + self, + output: tuple[list[Tensor], list[Tensor], list[Tensor], list[Tensor]], ) -> Packet[Tensor]: features, cls_score_list, reg_distri_list, kpt_list = output bs = features[0].shape[0] if self.export: - outputs = [] + outputs: list[Tensor] = [] for out_cls, out_reg, out_kpts in zip( cls_score_list, reg_distri_list, kpt_list, strict=True ): - chunks = out_kpts.split(3, dim=1) - modified_chunks = [] + chunks = torch.split(out_kpts, 3, dim=1) + modified_chunks: list[Tensor] = [] for chunk in chunks: x = chunk[:, 0:1, :, :] y = chunk[:, 1:2, :, :] @@ -105,11 +112,17 @@ def wrap( out = torch.cat([out_reg, out_cls, out_kpts_modified], dim=1) outputs.append(out) return {"outputs": outputs} + cls_tensor = torch.cat( - [cls_score_list[i].flatten(2) for i in range(len(cls_score_list))], dim=2 + [cls_score_list[i].flatten(2) for i in range(len(cls_score_list))], + dim=2, ).permute(0, 2, 1) reg_tensor = torch.cat( - [reg_distri_list[i].flatten(2) for i in range(len(reg_distri_list))], dim=2 + [ + reg_distri_list[i].flatten(2) + for i in range(len(reg_distri_list)) + ], + dim=2, ).permute(0, 2, 1) kpt_tensor = torch.cat( [ @@ -143,7 +156,7 @@ def wrap( "keypoints_raw": [kpt_tensor], } - def _dist2kpts(self, kpts): + def _dist2kpts(self, kpts: Tensor) -> Tensor: """Decodes keypoints.""" y = kpts.clone() @@ -154,8 +167,12 @@ def _dist2kpts(self, kpts): anchor_points_x = anchor_points_transposed[0].view(1, -1, 1) anchor_points_y = anchor_points_transposed[1].view(1, -1, 1) - y[:, :, 0::3] = (y[:, :, 0::3] * 2.0 + (anchor_points_x - 0.5)) * stride_tensor - y[:, :, 1::3] = (y[:, :, 1::3] * 2.0 + (anchor_points_y - 0.5)) * stride_tensor + y[:, :, 0::3] = ( + y[:, :, 0::3] * 2.0 + (anchor_points_x - 0.5) + ) * stride_tensor + y[:, :, 1::3] = ( + y[:, :, 1::3] * 2.0 + (anchor_points_y - 0.5) + ) * stride_tensor y[:, :, 2::3] = y[:, :, 2::3].sigmoid() return y @@ -163,10 +180,13 @@ def _dist2kpts(self, kpts): def _process_to_bbox_and_kps( self, output: tuple[list[Tensor], Tensor, Tensor, Tensor] ) -> list[Tensor]: - """Performs post-processing of the output and returns bboxs after NMS.""" + """Performs post-processing of the output and returns bboxs + after NMS.""" features, cls_score_list, reg_dist_list, keypoints = output - pred_bboxes = dist2bbox(reg_dist_list, self.anchor_points, out_format="xyxy") + pred_bboxes = dist2bbox( + reg_dist_list, self.anchor_points, out_format="xyxy" + ) pred_bboxes *= self.stride_tensor output_merged = torch.cat( diff --git a/luxonis_train/nodes/heads/implicit_keypoint_bbox_head.py b/luxonis_train/nodes/heads/implicit_keypoint_bbox_head.py index 0ca995c5..5de88650 100644 --- a/luxonis_train/nodes/heads/implicit_keypoint_bbox_head.py +++ b/luxonis_train/nodes/heads/implicit_keypoint_bbox_head.py @@ -1,34 +1,38 @@ import logging import math -from typing import cast +from typing import Any, cast import torch +from luxonis_ml.data import LabelType from torch import Tensor, nn from luxonis_train.nodes.base_node import BaseNode from 
luxonis_train.nodes.blocks import KeypointBlock, LearnableMulAddConv -from luxonis_train.utils.boxutils import ( +from luxonis_train.utils import ( + Packet, non_max_suppression, process_bbox_predictions, process_keypoints_predictions, ) -from luxonis_train.utils.types import LabelType, Packet logger = logging.getLogger(__name__) -class ImplicitKeypointBBoxHead(BaseNode): - tasks: list[LabelType] = [LabelType.KEYPOINTS, LabelType.BOUNDINGBOX] +class ImplicitKeypointBBoxHead( + BaseNode[list[Tensor], tuple[list[Tensor], Tensor]] +): + tasks = [LabelType.KEYPOINTS, LabelType.BOUNDINGBOX] + in_channels: list[int] def __init__( self, - num_heads: int = 3, + n_heads: int = 3, anchors: list[list[float]] | None = None, init_coco_biases: bool = True, conf_thres: float = 0.25, iou_thres: float = 0.45, max_det: int = 300, - **kwargs, + **kwargs: Any, ): """Head for object and keypoint detection. @@ -37,8 +41,8 @@ def __init__( TODO: more technical documentation - @type num_heads: int - @param num_heads: Number of output heads. Defaults to C{3}. + @type n_heads: int + @param n_heads: Number of output heads. Defaults to C{3}. B{Note:} Should be same also on neck in most cases. @type anchors: list[list[float]] | None @param anchors: Anchors used for object detection. @@ -53,16 +57,27 @@ def __init__( """ super().__init__(**kwargs) - if anchors is None: - logger.info("No anchors provided, generating them automatically.") - anchors, recall = self.dataset_metadata.autogenerate_anchors(num_heads) - logger.info(f"Anchors generated. Best possible recall: {recall:.2f}") - self.conf_thres = conf_thres self.iou_thres = iou_thres self.max_det = max_det - self.num_heads = num_heads + self.n_heads = n_heads + if len(self.in_channels) < self.n_heads: + logger.warning( + f"Head '{self.name}' was set to use {self.n_heads} heads, " + f"but received only {len(self.in_channels)} inputs. " + f"Changing number of heads to {len(self.in_channels)}." + ) + self.n_heads = len(self.in_channels) + + if anchors is None: + logger.info("No anchors provided, generating them automatically.") + anchors, recall = self.dataset_metadata.autogenerate_anchors( + self.n_heads + ) + logger.info( + f"Anchors generated. 
Best possible recall: {recall:.2f}" + ) self.box_offset = 5 self.n_det_out = self.n_classes + self.box_offset @@ -71,13 +86,13 @@ def __init__( self.n_anchors = len(anchors[0]) // 2 self.grid: list[Tensor] = [] - self.anchors = torch.tensor(anchors).float().view(self.num_heads, -1, 2) - self.anchor_grid = self.anchors.clone().view(self.num_heads, 1, -1, 1, 1, 2) - - self.channel_list, self.stride = self._fit_to_num_heads( - cast(list[int], self.in_channels) + self.anchors = torch.tensor(anchors).float().view(self.n_heads, -1, 2) + self.anchor_grid = self.anchors.clone().view( + self.n_heads, 1, -1, 1, 1, 2 ) + self.channel_list, self.stride = self._fit_to_n_heads(self.in_channels) + self.learnable_mul_add_conv = nn.ModuleList( LearnableMulAddConv( add_channel=in_channels, @@ -108,7 +123,7 @@ def forward(self, inputs: list[Tensor]) -> tuple[list[Tensor], Tensor]: self.anchor_grid = self.anchor_grid.to(inputs[0].device) - for i in range(self.num_heads): + for i in range(self.n_heads): feat = cast( Tensor, torch.cat( @@ -123,11 +138,17 @@ def forward(self, inputs: list[Tensor]) -> tuple[list[Tensor], Tensor]: batch_size, _, feature_height, feature_width = feat.shape if i >= len(self.grid): self.grid.append( - self._construct_grid(feature_width, feature_height).to(feat.device) + self._construct_grid(feature_width, feature_height).to( + feat.device + ) ) feat = feat.reshape( - batch_size, self.n_anchors, self.n_out, feature_height, feature_width + batch_size, + self.n_anchors, + self.n_out, + feature_height, + feature_width, ).permute(0, 1, 3, 4, 2) features.append(feat) @@ -139,8 +160,8 @@ def forward(self, inputs: list[Tensor]) -> tuple[list[Tensor], Tensor]: return features, torch.cat(predictions, dim=1) - def wrap(self, outputs: tuple[list[Tensor], Tensor]) -> Packet[Tensor]: - features, predictions = outputs + def wrap(self, output: tuple[list[Tensor], Tensor]) -> Packet[Tensor]: + features, predictions = output if self.export: return {"boxes_and_keypoints": [predictions]} @@ -160,7 +181,8 @@ def wrap(self, outputs: tuple[list[Tensor], Tensor]) -> Packet[Tensor]: return { "boundingbox": [detection[:, :6] for detection in nms], "keypoints": [ - detection[:, 6:].reshape(-1, self.n_keypoints, 3) for detection in nms + detection[:, 6:].reshape(-1, self.n_keypoints, 3) + for detection in nms ], "features": features, } @@ -169,10 +191,12 @@ def _build_predictions( self, feat: Tensor, anchor_grid: Tensor, grid: Tensor, stride: Tensor ) -> Tensor: batch_size = feat.shape[0] - x_bbox = feat[..., : self.box_offset + self.n_classes] - x_keypoints = feat[..., self.box_offset + self.n_classes :] + bbox = feat[..., : self.box_offset + self.n_classes] + keypoints = feat[..., self.box_offset + self.n_classes :] - box_cxcy, box_wh, box_tail = process_bbox_predictions(x_bbox, anchor_grid) + box_cxcy, box_wh, box_tail = process_bbox_predictions( + bbox, anchor_grid + ) grid = grid.to(box_cxcy.device) stride = stride.to(box_cxcy.device) box_cxcy = (box_cxcy + grid) * stride @@ -180,7 +204,7 @@ def _build_predictions( grid_x = grid[..., 0:1] grid_y = grid[..., 1:2] - kpt_x, kpt_y, kpt_vis = process_keypoints_predictions(x_keypoints) + kpt_x, kpt_y, kpt_vis = process_keypoints_predictions(keypoints) kpt_x = (kpt_x + grid_x) * stride kpt_y = (kpt_y + grid_y) * stride kpt_vis_sig = kpt_vis.sigmoid() @@ -200,12 +224,14 @@ def _infer_bbox( ) return torch.cat((out_bbox_xy, out_bbox_wh, out_bbox[..., 4:]), dim=-1) - def _fit_to_num_heads(self, channel_list: list): - out_channel_list = channel_list[: self.num_heads] + 
def _fit_to_n_heads( + self, channel_list: list[int] + ) -> tuple[list[int], Tensor]: + out_channel_list = channel_list[: self.n_heads] stride = torch.tensor( [ self.original_in_shape[1] / h - for h in cast(list[int], self.in_height)[: self.num_heads] + for h in cast(list[int], self.in_height)[: self.n_heads] ], dtype=torch.int, ) @@ -214,11 +240,15 @@ def _fit_to_num_heads(self, channel_list: list): def _initialize_weights_and_biases(self, class_freq: Tensor | None = None): for m in self.modules(): if isinstance(m, nn.Conv2d): - nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu") + nn.init.kaiming_normal_( + m.weight, mode="fan_out", nonlinearity="relu" + ) elif isinstance(m, nn.BatchNorm2d): m.eps = 1e-3 m.momentum = 0.03 - elif isinstance(m, (nn.Hardswish, nn.LeakyReLU, nn.ReLU, nn.ReLU6)): + elif isinstance( + m, (nn.Hardswish, nn.LeakyReLU, nn.ReLU, nn.ReLU6) + ): m.inplace = True for mi, s in zip(self.learnable_mul_add_conv, self.stride): @@ -233,7 +263,8 @@ def _initialize_weights_and_biases(self, class_freq: Tensor | None = None): def _construct_grid(self, feature_width: int, feature_height: int): grid_y, grid_x = torch.meshgrid( - [torch.arange(feature_height), torch.arange(feature_width)], indexing="ij" + [torch.arange(feature_height), torch.arange(feature_width)], + indexing="ij", ) return ( torch.stack((grid_x, grid_y), 2) diff --git a/luxonis_train/nodes/heads/segmentation_head.py b/luxonis_train/nodes/heads/segmentation_head.py index 1b29df7b..240b956c 100644 --- a/luxonis_train/nodes/heads/segmentation_head.py +++ b/luxonis_train/nodes/heads/segmentation_head.py @@ -1,39 +1,33 @@ -"""Implementation of a basic segmentation head. +from typing import Any -Adapted from: U{https://github.com/pytorch/vision/blob/main/torchvision/models/segmentation/fcn.py} -@license: U{BSD-3 } -""" - -import torch.nn as nn -from torch import Tensor +from luxonis_ml.data import LabelType +from torch import Tensor, nn from luxonis_train.nodes.base_node import BaseNode from luxonis_train.nodes.blocks import UpBlock -from luxonis_train.utils.general import infer_upscale_factor -from luxonis_train.utils.types import LabelType +from luxonis_train.utils import infer_upscale_factor class SegmentationHead(BaseNode[Tensor, Tensor]): in_height: int + in_width: int in_channels: int + tasks: list[LabelType] = [LabelType.SEGMENTATION] - def __init__(self, **kwargs): + def __init__(self, **kwargs: Any): """Basic segmentation FCN head. - Note that it doesn't ensure that ouptut is same size as input. - - @type kwargs: Any - @param kwargs: Additional arguments to pass to L{BaseNode}. + Adapted from: U{https://github.com/pytorch/vision/blob/main/torchvision/models/segmentation/fcn.py} + @license: U{BSD-3 } """ super().__init__(**kwargs) + h, w = self.original_in_shape[1:] + n_up = infer_upscale_factor((self.in_height, self.in_width), (h, w)) - original_height = self.original_in_shape[1] - num_up = infer_upscale_factor(self.in_height, original_height, strict=False) - - modules = [] + modules: list[nn.Module] = [] in_channels = self.in_channels - for _ in range(int(num_up)): + for _ in range(int(n_up)): modules.append( UpBlock(in_channels=in_channels, out_channels=in_channels // 2) ) diff --git a/luxonis_train/nodes/necks/reppan_neck.py b/luxonis_train/nodes/necks/reppan_neck.py index bd05f083..107151a6 100644 --- a/luxonis_train/nodes/necks/reppan_neck.py +++ b/luxonis_train/nodes/necks/reppan_neck.py @@ -1,141 +1,147 @@ -"""Implementation of the RepPANNeck module. 
- -Adapted from U{YOLOv6: A Single-Stage Object Detection Framework for Industrial -Applications}. -It has the balance of feature fusion ability and hardware efficiency. -""" - - -from typing import Literal, cast +from typing import Any, Literal from torch import Tensor, nn from luxonis_train.nodes.base_node import BaseNode from luxonis_train.nodes.blocks import RepDownBlock, RepUpBlock -from luxonis_train.utils.general import make_divisible +from luxonis_train.utils import make_divisible class RepPANNeck(BaseNode[list[Tensor], list[Tensor]]): + in_channels: list[int] + def __init__( self, - num_heads: Literal[2, 3, 4] = 3, + n_heads: Literal[2, 3, 4] = 3, channels_list: list[int] | None = None, - num_repeats: list[int] | None = None, + n_repeats: list[int] | None = None, depth_mul: float = 0.33, width_mul: float = 0.25, - **kwargs, + **kwargs: Any, ): - """Constructor for the RepPANNeck module. + """Implementation of the RepPANNeck module. + + Adapted from U{YOLOv6: A Single-Stage Object Detection Framework + for Industrial Applications}. + It has the balance of feature fusion ability and hardware efficiency. - @type num_heads: Literal[2,3,4] - @param num_heads: Number of output heads. Defaults to 3. ***Note: Should be same - also on head in most cases.*** + @type n_heads: Literal[2,3,4] + @param n_heads: Number of output heads. Defaults to 3. B{Note: Should be same + also on head in most cases.} @type channels_list: list[int] | None - @param channels_list: List of number of channels for each block. Defaults to - C{[256, 128, 128, 256, 256, 512]}. - @type num_repeats: list[int] | None - @param num_repeats: List of number of repeats of RepVGGBlock. Defaults to C{[12, - 12, 12, 12]}. + @param channels_list: List of number of channels for each block. + Defaults to C{[256, 128, 128, 256, 256, 512]}. + @type n_repeats: list[int] | None + @param n_repeats: List of number of repeats of RepVGGBlock. + Defaults to C{[12, 12, 12, 12]}. @type depth_mul: float - @param depth_mul: Depth multiplier. Defaults to 0.33. + @param depth_mul: Depth multiplier. Defaults to C{0.33}. @type width_mul: float - @param width_mul: Width multiplier. Defaults to 0.25. + @param width_mul: Width multiplier. Defaults to C{0.25}. 
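+
+        Illustrative effect of the multipliers, assuming
+        C{make_divisible} rounds up to the nearest multiple of the
+        divisor (with the defaults C{width_mul=0.25} and
+        C{depth_mul=0.33})::
+
+            >>> make_divisible(256 * 0.25, 8)  # channel scaling
+            64
+            >>> max(round(12 * 0.33), 1)  # repeat scaling
+            4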
""" super().__init__(**kwargs) - num_repeats = num_repeats or [12, 12, 12, 12] - channels_list = channels_list or [256, 128, 128, 256, 256, 512] + self.n_heads = n_heads - self.num_heads = num_heads + n_repeats = n_repeats or [12, 12, 12, 12] + channels_list = channels_list or [256, 128, 128, 256, 256, 512] - channels_list = [make_divisible(ch * width_mul, 8) for ch in channels_list] - num_repeats = [ - (max(round(i * depth_mul), 1) if i > 1 else i) for i in num_repeats + channels_list = [ + make_divisible(ch * width_mul, 8) for ch in channels_list ] - channels_list, num_repeats = self._fit_to_num_heads(channels_list, num_repeats) + n_repeats = [ + (max(round(i * depth_mul), 1) if i > 1 else i) for i in n_repeats + ] + channels_list, n_repeats = self._fit_to_n_heads( + channels_list, n_repeats + ) self.up_blocks = nn.ModuleList() - in_channels = cast(list[int], self.in_channels)[-1] + in_channels = self.in_channels[-1] out_channels = channels_list[0] - in_channels_next = cast(list[int], self.in_channels)[-2] - curr_num_repeats = num_repeats[0] + in_channels_next = self.in_channels[-2] + curr_n_repeats = n_repeats[0] up_out_channel_list = [in_channels] # used in DownBlocks - for i in range(1, num_heads): + for i in range(1, n_heads): curr_up_block = RepUpBlock( in_channels=in_channels, in_channels_next=in_channels_next, out_channels=out_channels, - num_repeats=curr_num_repeats, + n_repeats=curr_n_repeats, ) up_out_channel_list.append(out_channels) self.up_blocks.append(curr_up_block) - if len(self.up_blocks) == (num_heads - 1): + if len(self.up_blocks) == (n_heads - 1): up_out_channel_list.reverse() break in_channels = out_channels out_channels = channels_list[i] - in_channels_next = cast(list[int], self.in_channels)[-1 - (i + 1)] - curr_num_repeats = num_repeats[i] + in_channels_next = self.in_channels[-1 - (i + 1)] + curr_n_repeats = n_repeats[i] self.down_blocks = nn.ModuleList() - channels_list_down_blocks = channels_list[(num_heads - 1) :] - num_repeats_down_blocks = num_repeats[(num_heads - 1) :] + channels_list_down_blocks = channels_list[(n_heads - 1) :] + n_repeats_down_blocks = n_repeats[(n_heads - 1) :] in_channels = out_channels downsample_out_channels = channels_list_down_blocks[0] in_channels_next = up_out_channel_list[0] out_channels = channels_list_down_blocks[1] - curr_num_repeats = num_repeats_down_blocks[0] + curr_n_repeats = n_repeats_down_blocks[0] - for i in range(1, num_heads): + for i in range(1, n_heads): curr_down_block = RepDownBlock( in_channels=in_channels, downsample_out_channels=downsample_out_channels, in_channels_next=in_channels_next, out_channels=out_channels, - num_repeats=curr_num_repeats, + n_repeats=curr_n_repeats, ) self.down_blocks.append(curr_down_block) - if len(self.down_blocks) == (num_heads - 1): + if len(self.down_blocks) == (n_heads - 1): break in_channels = out_channels downsample_out_channels = channels_list_down_blocks[2 * i] in_channels_next = up_out_channel_list[i] out_channels = channels_list_down_blocks[2 * i + 1] - curr_num_repeats = num_repeats_down_blocks[i] + curr_n_repeats = n_repeats_down_blocks[i] def forward(self, inputs: list[Tensor]) -> list[Tensor]: - x0 = inputs[-1] - up_block_outs = [] - for i, up_block in enumerate(self.up_blocks): - conv_out, x0 = up_block(x0, inputs[-1 - (i + 1)]) + x = inputs[-1] + up_block_outs: list[Tensor] = [] + for up_block, input_ in zip( + self.up_blocks, inputs[-2::-1], strict=False + ): + conv_out, x = up_block(x, input_) up_block_outs.append(conv_out) - up_block_outs.reverse() - outs = [x0] - 
for i, down_block in enumerate(self.down_blocks):
-            x0 = down_block(x0, up_block_outs[i])
-            outs.append(x0)
+        outs = [x]
+        for down_block, up_out in zip(
+            self.down_blocks, reversed(up_block_outs)
+        ):
+            x = down_block(x, up_out)
+            outs.append(x)

         return outs

-    def _fit_to_num_heads(
-        self, channels_list: list[int], num_repeats: list[int]
+    def _fit_to_n_heads(
+        self, channels_list: list[int], n_repeats: list[int]
     ) -> tuple[list[int], list[int]]:
-        """Fits channels_list and num_repeats to num_heads by removing or adding items.
+        """Fits channels_list and n_repeats to n_heads by removing or
+        adding items.

         Also scales the numbers based on offset
         """
-        if self.num_heads == 3:
-            ...
-        elif self.num_heads == 2:
-            channels_list = [channels_list[0], channels_list[4], channels_list[5]]
-            num_repeats = [num_repeats[0], num_repeats[3]]
-        elif self.num_heads == 4:
+        if self.n_heads == 2:
+            channels_list = [channels_list[i] for i in [0, 4, 5]]
+            n_repeats = [n_repeats[0], n_repeats[3]]
+        elif self.n_heads == 3:
+            return channels_list, n_repeats
+        elif self.n_heads == 4:
             channels_list = [
                 channels_list[0],
                 channels_list[1],
@@ -147,17 +153,11 @@ def _fit_to_num_heads(
                 channels_list[4],
                 channels_list[5],
             ]
-            num_repeats = [
-                num_repeats[0],
-                num_repeats[1],
-                num_repeats[1],
-                num_repeats[2],
-                num_repeats[2],
-                num_repeats[3],
-            ]
+            n_repeats = [n_repeats[i] for i in [0, 1, 1, 2, 2, 3]]
         else:
             raise ValueError(
-                f"Specified number of heads ({self.num_heads}) not supported."
+                f"Specified number of heads ({self.n_heads}) not supported. "
+                "The number of heads should be 2, 3 or 4."
             )

-        return channels_list, num_repeats
+        return channels_list, n_repeats
diff --git a/luxonis_train/optimizers/__init__.py b/luxonis_train/optimizers/__init__.py
new file mode 100644
index 00000000..acd73792
--- /dev/null
+++ b/luxonis_train/optimizers/__init__.py
@@ -0,0 +1 @@
+from .optimizers import *
diff --git a/luxonis_train/utils/optimizers.py b/luxonis_train/optimizers/optimizers.py
similarity index 92%
rename from luxonis_train/utils/optimizers.py
rename to luxonis_train/optimizers/optimizers.py
index 7583cef9..c2a4bf12 100644
--- a/luxonis_train/utils/optimizers.py
+++ b/luxonis_train/optimizers/optimizers.py
@@ -1,4 +1,4 @@
-from torch import optim
+import torch.optim as optim

 from luxonis_train.utils.registry import OPTIMIZERS
diff --git a/luxonis_train/schedulers/__init__.py b/luxonis_train/schedulers/__init__.py
new file mode 100644
index 00000000..99bcd9d9
--- /dev/null
+++ b/luxonis_train/schedulers/__init__.py
@@ -0,0 +1 @@
+from .schedulers import *
diff --git a/luxonis_train/utils/schedulers.py b/luxonis_train/schedulers/schedulers.py
similarity index 100%
rename from luxonis_train/utils/schedulers.py
rename to luxonis_train/schedulers/schedulers.py
diff --git a/luxonis_train/utils/__init__.py b/luxonis_train/utils/__init__.py
index 609304c3..c47d3d33 100644
--- a/luxonis_train/utils/__init__.py
+++ b/luxonis_train/utils/__init__.py
@@ -1,5 +1,52 @@
-from .assigners import *
-from .config import *
-from .loaders import *
-from .optimizers import *
-from .schedulers import *
+from .boundingbox import (
+    anchors_for_fpn_features,
+    anchors_from_dataset,
+    bbox2dist,
+    bbox_iou,
+    compute_iou_loss,
+    dist2bbox,
+    match_to_anchor,
+    non_max_suppression,
+    process_bbox_predictions,
+)
+from .config import Config
+from .dataset_metadata import DatasetMetadata
+from .exceptions import IncompatibleException
+from .general import (
+    get_with_default,
+    infer_upscale_factor,
+    make_divisible,
+    to_shape_packet,
+) +from .graph import is_acyclic, traverse_graph +from .keypoints import get_sigmas, process_keypoints_predictions +from .tracker import LuxonisTrackerPL +from .types import AttachIndexType, Kwargs, Labels, Packet + +__all__ = [ + "Config", + "AttachIndexType", + "Kwargs", + "Labels", + "Packet", + "IncompatibleException", + "DatasetMetadata", + "make_divisible", + "infer_upscale_factor", + "to_shape_packet", + "get_with_default", + "LuxonisTrackerPL", + "match_to_anchor", + "dist2bbox", + "bbox2dist", + "bbox_iou", + "non_max_suppression", + "anchors_from_dataset", + "anchors_for_fpn_features", + "process_bbox_predictions", + "compute_iou_loss", + "process_keypoints_predictions", + "get_sigmas", + "is_acyclic", + "traverse_graph", +] diff --git a/luxonis_train/utils/boxutils.py b/luxonis_train/utils/boundingbox.py similarity index 87% rename from luxonis_train/utils/boxutils.py rename to luxonis_train/utils/boundingbox.py index 3a206c75..9b97bfe6 100644 --- a/luxonis_train/utils/boxutils.py +++ b/luxonis_train/utils/boundingbox.py @@ -1,12 +1,10 @@ -"""This module contains various utility functions for working with bounding boxes.""" - import math from typing import Literal, TypeAlias import torch +from luxonis_ml.data import LabelType from scipy.cluster.vq import kmeans from torch import Tensor -from torch.utils.data import DataLoader from torchvision.ops import ( batched_nms, box_convert, @@ -15,24 +13,11 @@ generalized_box_iou, ) -from luxonis_train.utils.types import LabelType +from luxonis_train.loaders import BaseLoaderTorch IoUType: TypeAlias = Literal["none", "giou", "diou", "ciou", "siou"] BBoxFormatType: TypeAlias = Literal["xyxy", "xywh", "cxcywh"] -__all__ = [ - "anchors_for_fpn_features", - "anchors_from_dataset", - "bbox2dist", - "bbox_iou", - "compute_iou_loss", - "dist2bbox", - "match_to_anchor", - "non_max_suppression", - "process_bbox_predictions", - "process_keypoints_predictions", -] - def match_to_anchor( targets: Tensor, @@ -178,8 +163,21 @@ def bbox_iou( @param bbox2: Second set of bboxes [M, 4]. @type bbox_format: BBoxFormatType @param bbox_format: Input bbox format. Defaults to "xyxy". - @type iou_type: IoUType + @type iou_type: Literal["none", "giou", "diou", "ciou", "siou"] @param iou_type: IoU type. Defaults to "none". + Possible values are: + - "none": standard IoU + - "giou": Generalized IoU + - "diou": Distance IoU + - "ciou": Complete IoU. Introduced in U{ + Enhancing Geometric Factors in Model Learning and + Inference for Object Detection and Instance + Segmentation}. + Implementation adapted from torchvision C{complete_box_iou} + with improved stability. + - "siou": Soft IoU. Introduced in U{ + SIoU Loss: More Powerful Learning for Bounding Box + Regression}. @type element_wise: bool @param element_wise: If True returns element wise IoUs. Defaults to False. @rtype: Tensor @@ -197,9 +195,6 @@ def bbox_iou( elif iou_type == "diou": iou = distance_box_iou(bbox1, bbox2) elif iou_type == "ciou": - # CIoU from `Enhancing Geometric Factors in Model Learning and Inference for - # Object Detection and Instance Segmentation`, https://arxiv.org/pdf/2005.03572.pdf. 
- # Implementation adapted from torchvision complete_box_iou with added eps for stability eps = 1e-7 iou = bbox_iou(bbox1, bbox2, iou_type="none") @@ -218,9 +213,6 @@ def bbox_iou( iou = diou - alpha * v elif iou_type == "siou": - # SIoU from `SIoU Loss: More Powerful Learning for Bounding Box Regression`, - # https://arxiv.org/pdf/2205.12740.pdf - eps = 1e-7 bbox1_xywh = box_convert(bbox1, in_fmt="xyxy", out_fmt="xywh") w1, h1 = bbox1_xywh[:, 2], bbox1_xywh[:, 3] @@ -247,7 +239,9 @@ def bbox_iou( sin_alpha_1 = torch.abs(s_cw) / sigma sin_alpha_2 = torch.abs(s_ch) / sigma threshold = pow(2, 0.5) / 2 - sin_alpha = torch.where(sin_alpha_1 > threshold, sin_alpha_2, sin_alpha_1) + sin_alpha = torch.where( + sin_alpha_1 > threshold, sin_alpha_2, sin_alpha_1 + ) angle_cost = torch.cos(torch.arcsin(sin_alpha) * 2 - math.pi / 2) # distance cost @@ -287,7 +281,8 @@ def non_max_suppression( max_det: int = 300, predicts_objectness: bool = True, ) -> list[Tensor]: - """Non-maximum suppression on model's predictions to keep only best instances. + """Non-maximum suppression on model's predictions to keep only best + instances. @type preds: Tensor @param preds: Model's prediction tensor of shape [bs, N, M]. @@ -340,7 +335,9 @@ def non_max_suppression( torch.max(preds[..., 5 : 5 + n_classes], dim=-1)[0] > conf_thres, ) - output = [torch.zeros((0, preds.size(-1)), device=preds.device)] * preds.size(0) + output = [ + torch.zeros((0, preds.size(-1)), device=preds.device) + ] * preds.size(0) for i, x in enumerate(preds): curr_out = x[candidate_mask[i]] @@ -363,7 +360,9 @@ def non_max_suppression( if multi_label: box_idx, class_idx = ( - (curr_out[:, 5 : 5 + n_classes] > conf_thres).nonzero(as_tuple=False).T + (curr_out[:, 5 : 5 + n_classes] > conf_thres) + .nonzero(as_tuple=False) + .T ) keep_mask[box_idx] = True curr_out = torch.cat( @@ -375,9 +374,13 @@ def non_max_suppression( 1, ) else: - conf, class_idx = curr_out[:, 5 : 5 + n_classes].max(1, keepdim=True) + conf, class_idx = curr_out[:, 5 : 5 + n_classes].max( + 1, keepdim=True + ) keep_mask[conf.view(-1) > conf_thres] = True - curr_out = torch.cat((bboxes, conf, class_idx.float()), 1)[keep_mask] + curr_out = torch.cat((bboxes, conf, class_idx.float()), 1)[ + keep_mask + ] if has_additional: curr_out = torch.hstack( @@ -409,41 +412,37 @@ def non_max_suppression( def anchors_from_dataset( - loader: DataLoader, + loader: BaseLoaderTorch, n_anchors: int = 9, n_generations: int = 1000, ratio_threshold: float = 4.0, ) -> tuple[Tensor, float]: - """Generates anchors based on bounding box annotations present in provided data - loader. It uses K-Means for initial proposals which are then refined with genetic - algorithm. + """Generates anchors based on bounding box annotations present in + provided data loader. It uses K-Means for initial proposals which + are then refined with genetic algorithm. @type loader: L{torch.utils.data.DataLoader} @param loader: Data loader. @type n_anchors: int - @param n_anchors: Number of anchors, this is normally num_heads * 3 which generates - 3 anchors per layer. Defaults to 9. + @param n_anchors: Number of anchors, this is normally n_heads * 3 + which generates 3 anchors per layer. Defaults to 9. @type n_generations: int - @param n_generations: Number of iterations for anchor improvement with genetic - algorithm. Defaults to 1000. + @param n_generations: Number of iterations for anchor improvement + with genetic algorithm. Defaults to 1000. @type ratio_threshold: float - @param ratio_threshold: Minimum threshold for ratio. 
Defaults to 4.0. + @param ratio_threshold: Minimum threshold for ratio. Defaults to + 4.0. @rtype: tuple[Tensor, float] @return: Proposed anchors and the best possible recall. """ - widths = [] - inputs = None - for inp, labels in loader: + widths: list[Tensor] = [] + for _, labels in loader: for tensor, label_type in labels.values(): if label_type == LabelType.BOUNDINGBOX: curr_wh = tensor[:, 4:] widths.append(curr_wh) - inputs = inp - assert inputs is not None, "No inputs found in data loader" - _, _, h, w = inputs[ - loader.dataset.image_source # type: ignore - ].shape # assuming all images are same size + _, h, w = loader.input_shape img_size = torch.tensor([w, h]) wh = torch.vstack(widths) * img_size @@ -463,7 +462,8 @@ def anchors_from_dataset( except Exception: print("Fallback to random anchor init") proposed_anchors = ( - torch.sort(torch.rand(n_anchors * 2))[0].reshape(n_anchors, 2) * img_size + torch.sort(torch.rand(n_anchors * 2))[0].reshape(n_anchors, 2) + * img_size ) proposed_anchors = proposed_anchors[ @@ -471,7 +471,8 @@ def anchors_from_dataset( ] # sort small to large def calc_best_anchor_ratio(anchors: Tensor, wh: Tensor) -> Tensor: - """Calculate how well most suitable anchor box matches each target bbox.""" + """Calculate how well most suitable anchor box matches each + target bbox.""" symmetric_size_ratios = torch.min( wh[:, None] / anchors[None], anchors[None] / wh[:, None] ) @@ -480,17 +481,20 @@ def calc_best_anchor_ratio(anchors: Tensor, wh: Tensor) -> Tensor: return best_anchor_ratio def calc_best_possible_recall(anchors: Tensor, wh: Tensor) -> Tensor: - """Calculate best possible recall if every bbox is matched to an appropriate - anchor.""" + """Calculate best possible recall if every bbox is matched to an + appropriate anchor.""" best_anchor_ratio = calc_best_anchor_ratio(anchors, wh) - best_possible_recall = (best_anchor_ratio > 1 / ratio_threshold).float().mean() + best_possible_recall = ( + (best_anchor_ratio > 1 / ratio_threshold).float().mean() + ) return best_possible_recall def anchor_fitness(anchors: Tensor, wh: Tensor) -> Tensor: """Fitness function used for anchor evolve.""" best_anchor_ratio = calc_best_anchor_ratio(anchors, wh) return ( - best_anchor_ratio * (best_anchor_ratio > 1 / ratio_threshold).float() + best_anchor_ratio + * (best_anchor_ratio > 1 / ratio_threshold).float() ).mean() # Genetic algorithm @@ -508,7 +512,9 @@ def anchor_fitness(anchors: Tensor, wh: Tensor) -> Tensor: + mutation_noise_mean ).clip(0.3, 3.0) - mutated_anchors = (proposed_anchors.clone() * anchor_mutation).clip(min=2.0) + mutated_anchors = (proposed_anchors.clone() * anchor_mutation).clip( + min=2.0 + ) mutated_fitness = anchor_fitness(mutated_anchors, wh) if mutated_fitness > best_fitness: best_fitness = mutated_fitness @@ -529,20 +535,22 @@ def anchors_for_fpn_features( grid_cell_offset: float = 0.5, multiply_with_stride: bool = False, ) -> tuple[Tensor, Tensor, list[int], Tensor]: - """Generates anchor boxes, points and strides based on FPN feature shapes and - strides. + """Generates anchor boxes, points and strides based on FPN feature + shapes and strides. @type features: list[Tensor] @param features: List of FPN features. @type strides: Tensor @param strides: Strides of FPN features. @type grid_cell_size: float - @param grid_cell_size: Cell size in respect to input image size. Defaults to 5.0. + @param grid_cell_size: Cell size in respect to input image size. + Defaults to 5.0. 
@type grid_cell_offset: float - @param grid_cell_offset: Percent grid cell center's offset. Defaults to 0.5. + @param grid_cell_offset: Percent grid cell center's offset. Defaults + to 0.5. @type multiply_with_stride: bool - @param multiply_with_stride: Whether to multiply per FPN values with its stride. - Defaults to False. + @param multiply_with_stride: Whether to multiply per FPN values with + its stride. Defaults to False. @rtype: tuple[Tensor, Tensor, list[int], Tensor] @return: BBox anchors, center anchors, number of anchors, strides """ @@ -576,7 +584,9 @@ def anchors_for_fpn_features( anchors.append(anchor) anchor_point = ( - torch.stack([shift_x, shift_y], dim=-1).reshape(-1, 2).to(feature.dtype) + torch.stack([shift_x, shift_y], dim=-1) + .reshape(-1, 2) + .to(feature.dtype) ) anchor_points.append(anchor_point) @@ -595,26 +605,6 @@ def anchors_for_fpn_features( ) -def process_keypoints_predictions(keypoints: Tensor) -> tuple[Tensor, Tensor, Tensor]: - """Extracts x, y and visibility from keypoints predictions. - - @type keypoints: Tensor - @param keypoints: Keypoints predictions. The last dimension must be divisible by 3 - and is expected to be in format [x1, y1, v1, x2, y2, v2, ...]. - - @rtype: tuple[Tensor, Tensor, Tensor] - @return: x, y and visibility tensors. - """ - x = keypoints[..., ::3] * 2.0 - 0.5 - y = keypoints[..., 1::3] * 2.0 - 0.5 - visibility = keypoints[..., 2::3] - return ( - x, - y, - visibility, - ) - - def process_bbox_predictions( bbox: Tensor, anchor: Tensor ) -> tuple[Tensor, Tensor, Tensor]: @@ -625,7 +615,8 @@ def process_bbox_predictions( @type anchor: Tensor @param anchor: Anchor boxes @rtype: tuple[Tensor, Tensor, Tensor] - @return: xy and wh predictions and tail. The tail is anything after xywh. + @return: xy and wh predictions and tail. The tail is anything after + xywh. 
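+
+        Illustrative decoding of the C{xy} part (C{sigmoid(0) = 0.5},
+        so a zero logit decodes to the center of the grid cell)::
+
+            >>> torch.zeros(1, 2).sigmoid() * 2.0 - 0.5
+            tensor([[0.5000, 0.5000]])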
""" out_bbox = bbox.sigmoid() out_bbox_xy = out_bbox[..., 0:2] * 2.0 - 0.5 @@ -681,10 +672,12 @@ def compute_iou_loss( else: bbox_mask = torch.ones_like(pred_bboxes, dtype=torch.bool) - pred_bboxes_pos = torch.masked_select(pred_bboxes, bbox_mask).reshape([-1, 4]) - target_bboxes_pos = torch.masked_select(target_bboxes, bbox_mask).reshape( + pred_bboxes_pos = torch.masked_select(pred_bboxes, bbox_mask).reshape( [-1, 4] ) + target_bboxes_pos = torch.masked_select( + target_bboxes, bbox_mask + ).reshape([-1, 4]) iou = bbox_iou( pred_bboxes_pos, diff --git a/luxonis_train/utils/config.py b/luxonis_train/utils/config.py index 31e4fe5b..b94f08a5 100644 --- a/luxonis_train/utils/config.py +++ b/luxonis_train/utils/config.py @@ -10,8 +10,13 @@ LuxonisConfig, LuxonisFileSystem, ) -from pydantic import Field, field_validator, model_validator -from pydantic.types import FilePath, NonNegativeFloat, NonNegativeInt, PositiveInt +from pydantic import AliasChoices, Field, field_validator, model_validator +from pydantic.types import ( + FilePath, + NonNegativeFloat, + NonNegativeInt, + PositiveInt, +) from typing_extensions import Self logger = logging.getLogger(__name__) @@ -82,7 +87,9 @@ def check_predefined_model(self) -> Self: from luxonis_train.utils.registry import MODELS if self.predefined_model: - logger.info(f"Using predefined model: `{self.predefined_model.name}`") + logger.info( + f"Using predefined model: `{self.predefined_model.name}`" + ) model = MODELS.get(self.predefined_model.name)( **self.predefined_model.params ) @@ -122,14 +129,16 @@ def check_main_metric(self) -> Self: @model_validator(mode="after") def check_graph(self) -> Self: - from luxonis_train.utils.general import is_acyclic + from luxonis_train.utils import is_acyclic graph = {node.alias or node.name: node.inputs for node in self.nodes} if not is_acyclic(graph): raise ValueError("Model graph is not acyclic.") if not self.outputs: outputs: list[str] = [] # nodes which are not inputs to any nodes - inputs = set(node_name for node in self.nodes for node_name in node.inputs) + inputs = set( + node_name for node in self.nodes for node_name in node.inputs + ) for node in self.nodes: name = node.alias or node.name if name not in inputs: @@ -147,7 +156,7 @@ def check_unique_names(self) -> Self: ("metrics", self.metrics), ("visualizers", self.visualizers), ]: - names = set() + names: set[str] = set() for obj in objects: obj: AttachedModuleConfig name = obj.alias or obj.name @@ -232,7 +241,9 @@ class PreprocessingConfig(BaseModelExtraForbid): def check_normalize(self) -> Self: if self.normalize.active: self.augmentations.append( - AugmentationConfig(name="Normalize", params=self.normalize.params) + AugmentationConfig( + name="Normalize", params=self.normalize.params + ) ) return self @@ -268,20 +279,34 @@ class TrainerConfig(BaseModelExtraForbid): accelerator: Literal["auto", "cpu", "gpu", "tpu"] = "auto" devices: int | list[int] | str = "auto" strategy: Literal["auto", "ddp"] = "auto" - num_sanity_val_steps: int = 2 + n_sanity_val_steps: Annotated[ + int, + Field( + validation_alias=AliasChoices( + "n_sanity_val_steps", "num_sanity_val_steps" + ) + ), + ] = 2 profiler: Literal["simple", "advanced"] | None = None matmul_precision: Literal["medium", "high", "highest"] | None = None verbose: bool = True seed: int | None = None + deterministic: bool | Literal["warn"] | None = None batch_size: PositiveInt = 32 accumulate_grad_batches: PositiveInt = 1 use_weighted_sampler: bool = False epochs: PositiveInt = 100 - num_workers: NonNegativeInt 
= 4
+    n_workers: Annotated[
+        NonNegativeInt,
+        Field(validation_alias=AliasChoices("n_workers", "num_workers")),
+    ] = 4
     train_metrics_interval: Literal[-1] | PositiveInt = -1
     validation_interval: Literal[-1] | PositiveInt = 5
-    num_log_images: NonNegativeInt = 4
+    n_log_images: Annotated[
+        NonNegativeInt,
+        Field(validation_alias=AliasChoices("n_log_images", "num_log_images")),
+    ] = 4
     skip_last_batch: bool = True
     pin_memory: bool = True
     log_sub_losses: bool = True
@@ -293,13 +318,24 @@ class TrainerConfig(BaseModelExtraForbid):
     scheduler: SchedulerConfig = SchedulerConfig()

     @model_validator(mode="after")
-    def check_num_workes_platform(self) -> Self:
+    def validate_deterministic(self) -> Self:
+        if self.seed is not None and self.deterministic is None:
+            logger.warning(
+                "Setting `trainer.deterministic` to True because `trainer.seed` is set. "
+                "This can cause certain layers to fail. "
+                "In such cases, set `trainer.deterministic` to `'warn'`."
+            )
+            self.deterministic = True
+        return self
+
+    @model_validator(mode="after")
+    def check_n_workers_platform(self) -> Self:
         if (
             sys.platform == "win32" or sys.platform == "darwin"
-        ) and self.num_workers != 0:
-            self.num_workers = 0
+        ) and self.n_workers != 0:
+            self.n_workers = 0
             logger.warning(
-                "Setting `num_workers` to 0 because of platform compatibility."
+                "Setting `n_workers` to 0 because of platform compatibility."
             )
         return self

@@ -321,7 +357,9 @@ class OnnxExportConfig(BaseModelExtraForbid):
 class BlobconverterExportConfig(BaseModelExtraForbid):
     active: bool = False
     shaves: int = 6
-    version: Literal["2021.2", "2021.3", "2021.4", "2022.1", "2022.3_RVC3"] = "2022.1"
+    version: Literal["2021.2", "2021.3", "2021.4", "2022.1", "2022.3_RVC3"] = (
+        "2022.1"
+    )


 class ArchiveConfig(BaseModelExtraForbid):
@@ -403,7 +441,9 @@ def get_config(
         return instance
     fs = LuxonisFileSystem(cfg)
     if fs.is_mlflow:
-        logger.info("Setting `project_id` and `run_id` to config's MLFlow run")
+        logger.info(
+            "Setting `project_id` and `run_id` to config's MLFlow run"
+        )
         instance.tracker.project_id = fs.experiment_id
         instance.tracker.run_id = fs.run_id
     return instance
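
The `AliasChoices` pattern above keeps the legacy `num_*` spellings of the
renamed `n_*` options working. A minimal, self-contained sketch of the
behaviour (hypothetical `Example` model, not part of this patch):

    from pydantic import AliasChoices, BaseModel, Field

    class Example(BaseModel):
        n_workers: int = Field(
            4, validation_alias=AliasChoices("n_workers", "num_workers")
        )

    # Both the new and the legacy key populate the same field.
    assert Example.model_validate({"num_workers": 8}).n_workers == 8
    assert Example.model_validate({"n_workers": 2}).n_workers == 2
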
diff --git a/luxonis_train/utils/dataset_metadata.py b/luxonis_train/utils/dataset_metadata.py
new file mode 100644
index 00000000..35ebbef8
--- /dev/null
+++ b/luxonis_train/utils/dataset_metadata.py
@@ -0,0 +1,154 @@
+from luxonis_train.loaders import BaseLoaderTorch
+from luxonis_train.utils import anchors_from_dataset
+
+
+class DatasetMetadata:
+    """Metadata about the dataset."""
+
+    def __init__(
+        self,
+        *,
+        classes: dict[str, list[str]] | None = None,
+        n_keypoints: dict[str, int] | None = None,
+        loader: BaseLoaderTorch | None = None,
+    ):
+        """An object containing metadata about the dataset. Used to
+        infer the number of classes, number of keypoints, I{etc.}
+        instead of passing them as arguments to the model.
+
+        @type classes: dict[str, list[str]] | None
+        @param classes: Dictionary mapping tasks to lists of class
+            names.
+        @type n_keypoints: dict[str, int] | None
+        @param n_keypoints: Dictionary mapping tasks to the number of
+            keypoints.
+        @type loader: BaseLoaderTorch | None
+        @param loader: Dataset loader.
+        """
+        self._classes = classes or {}
+        self._n_keypoints = n_keypoints or {}
+        self._loader = loader
+
+    def n_classes(self, task: str | None = None) -> int:
+        """Gets the number of classes for the specified task.
+
+        @type task: str | None
+        @param task: Task to get the number of classes for.
+        @rtype: int
+        @return: Number of classes for the specified task.
+        @raises ValueError: If the C{task} is not present in the
+            dataset.
+        @raises RuntimeError: If the C{task} was not provided and the
+            dataset contains different numbers of classes for different
+            tasks.
+        """
+        if task is not None:
+            if task not in self._classes:
+                raise ValueError(
+                    f"Task '{task}' is not present in the dataset."
+                )
+            return len(self._classes[task])
+        n_classes = len(list(self._classes.values())[0])
+        for classes in self._classes.values():
+            if len(classes) != n_classes:
+                raise RuntimeError(
+                    "The dataset contains different numbers of classes for different tasks. "
+                    "Please specify the 'task' argument to get the number of classes."
+                )
+        return n_classes
+
+    def n_keypoints(self, task: str | None = None) -> int:
+        """Gets the number of keypoints for the specified task.
+
+        @type task: str | None
+        @param task: Task to get the number of keypoints for.
+        @rtype: int
+        @return: Number of keypoints for the specified task.
+        @raises ValueError: If the C{task} is not present in the
+            dataset.
+        @raises RuntimeError: If the C{task} was not provided and the
+            dataset contains different numbers of keypoints for
+            different tasks.
+        """
+        if task is not None:
+            if task not in self._n_keypoints:
+                raise ValueError(
+                    f"Task '{task}' is not present in the dataset."
+                )
+            return self._n_keypoints[task]
+        n_keypoints = next(iter(self._n_keypoints.values()))
+        for n in self._n_keypoints.values():
+            if n != n_keypoints:
+                raise RuntimeError(
+                    "The dataset contains different numbers of keypoints for different tasks. "
+                    "Please specify the 'task' argument to get the number of keypoints."
+                )
+        return n_keypoints
+
+    def classes(self, task: str | None = None) -> list[str]:
+        """Gets the class names for the specified task.
+
+        @type task: str | None
+        @param task: Task to get the class names for.
+        @rtype: list[str]
+        @return: List of class names for the specified task.
+        @raises ValueError: If the C{task} is not present in the
+            dataset.
+        @raises RuntimeError: If the C{task} was not provided and the
+            dataset contains different class names for different
+            tasks.
+        """
+        if task is not None:
+            if task not in self._classes:
+                raise ValueError(
+                    f"Task '{task}' is not present in the dataset."
+                )
+            return self._classes[task]
+        class_names = list(self._classes.values())[0]
+        for classes in self._classes.values():
+            if classes != class_names:
+                raise RuntimeError(
+                    "The dataset contains different class names for different tasks."
+                )
+        return class_names
+
+    def autogenerate_anchors(
+        self, n_heads: int
+    ) -> tuple[list[list[float]], float]:
+        """Automatically generates anchors for the provided dataset.
+
+        @type n_heads: int
+        @param n_heads: Number of heads to generate anchors for.
+        @rtype: tuple[list[list[float]], float]
+        @return: List of anchors in [-1,6] format and recall of the
+            anchors.
+        @raises RuntimeError: If the dataset loader was not provided
+            during initialization.
+        """
+        if self._loader is None:
+            raise RuntimeError(
+                "Cannot generate anchors without a dataset loader. "
+                "Please provide a dataset loader to the constructor."
+            )
+
+        proposed_anchors, recall = anchors_from_dataset(
+            self._loader, n_anchors=n_heads * 3
+        )
+        return proposed_anchors.reshape(-1, 6).tolist(), recall
+
+    @classmethod
+    def from_loader(cls, loader: BaseLoaderTorch) -> "DatasetMetadata":
+        """Creates a L{DatasetMetadata} object from a dataset loader.
+
+        @type loader: BaseLoaderTorch
+        @param loader: Loader to create the metadata from.
+        @rtype: DatasetMetadata
+        @return: Instance of L{DatasetMetadata} created from the
+            provided loader.
+        """
+        classes = loader.get_classes()
+        n_keypoints = loader.get_n_keypoints()
+
+        instance = cls(classes=classes, n_keypoints=n_keypoints, loader=loader)
+        return instance
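
For orientation, a small sketch of the task-scoped queries defined above
(made-up task names and class lists, not taken from the test data):

    from luxonis_train.utils.dataset_metadata import DatasetMetadata

    metadata = DatasetMetadata(
        classes={
            "color-segmentation": ["red", "green", "blue"],
            "vehicle-segmentation": ["background", "vehicle"],
        }
    )
    assert metadata.n_classes("color-segmentation") == 3
    assert metadata.classes("vehicle-segmentation") == ["background", "vehicle"]
    # Without a `task`, the differing class counts raise a RuntimeError:
    # metadata.n_classes()
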
diff --git a/luxonis_train/utils/exceptions.py b/luxonis_train/utils/exceptions.py
new file mode 100644
index 00000000..bab8c1aa
--- /dev/null
+++ b/luxonis_train/utils/exceptions.py
@@ -0,0 +1,12 @@
+class IncompatibleException(Exception):
+    """Raised when two parts of the model are incompatible with each
+    other."""
+
+    @classmethod
+    def from_missing_task(
+        cls, task: str, present_tasks: list[str], class_name: str
+    ):
+        return cls(
+            f"{class_name} requires '{task}' label, but it was not found in "
+            f"the label dictionary. Available labels: {present_tasks}."
+        )
diff --git a/luxonis_train/utils/general.py b/luxonis_train/utils/general.py
index 5ae3b43f..45013807 100644
--- a/luxonis_train/utils/general.py
+++ b/luxonis_train/utils/general.py
@@ -1,272 +1,141 @@
 import logging
 import math
-from copy import deepcopy
-from typing import Generator, TypeVar
+from typing import TypeVar

-from pydantic import BaseModel
 from torch import Size, Tensor
-from torch.utils.data import DataLoader

-from luxonis_train.utils.boxutils import anchors_from_dataset
-from luxonis_train.utils.loaders import BaseLoaderTorch
 from luxonis_train.utils.types import Packet

+logger = logging.getLogger(__name__)

-class DatasetMetadata:
-    """Metadata about the dataset."""

-    def __init__(
-        self,
-        *,
-        classes: dict[str, list[str]] | None = None,
-        n_keypoints: dict[str, int] | None = None,
-        loader: DataLoader | None = None,
-    ):
-        """An object containing metadata about the dataset. Used to infer the number of
-        classes, number of keypoints, I{etc.} instead of passing them as arguments to
-        the model.
-
-        @type classes: dict[str, list[str]] | None
-        @param classes: Dictionary mapping tasks to lists of class names.
-        @type n_keypoints: dict[str, int] | None
-        @param n_keypoints: Dictionary mapping tasks to the number of keypoints.
-        @type loader: DataLoader | None
-        @param loader: Dataset loader.
-        """
-        self._classes = classes or {}
-        self._n_keypoints = n_keypoints or {}
-        self._loader = loader
+def make_divisible(x: int | float, divisor: int) -> int:
+    """Rounds C{x} up so that it is evenly divisible by the divisor.

-    @property
-    def classes(self) -> dict[str, list[str]]:
-        """Dictionary mapping label types to lists of class names.
+    Equivalent to M{ceil(x / divisor) * divisor}.

-        @type: dict[str, list[str]]
-        @raises ValueError: If classes were not provided during initialization.
-        """
-        if self._classes is None:
-            raise ValueError(
-                "Trying to access `classes`, byt they were not"
-                "provided during initialization."
-            )
-        return self._classes
+    @type x: int | float
+    @param x: Value to be rounded.
+    @type divisor: int
+    @param divisor: Divisor.
+    @rtype: int
+    @return: Rounded value.
+    """
+    return math.ceil(x / divisor) * divisor

-    def n_classes(self, task: str | None) -> int:
-        """Gets the number of classes for the specified task.

-        @type task: str | None
-        @param task: Task to get the number of classes for.
-        @rtype: int
-        @return: Number of classes for the specified label type.
-        @raises ValueError: If the dataset loader was not provided during
-            initialization.
- @raises ValueError: If the dataset contains different number of classes for - different label types. - """ - if task is not None: - if task not in self.classes: - raise ValueError(f"Task '{task}' is not present in the dataset.") - return len(self.classes[task]) - n_classes = len(list(self.classes.values())[0]) - for classes in self.classes.values(): - if len(classes) != n_classes: - raise ValueError( - "The dataset contains different number of classes for different tasks." - ) - return n_classes +def infer_upscale_factor( + in_size: tuple[int, int] | int, orig_size: tuple[int, int] | int +) -> int: + """Infer the upscale factor from the input shape and the original + shape. + + @type in_size: tuple[int, int] | int + @param in_size: Input shape as a tuple of (height, width) or just + one of them. + @type orig_size: tuple[int, int] | int + @param orig_size: Original shape as a tuple of (height, width) or + just one of them. + @rtype: int + @return: Upscale factor. + @raise ValueError: If the C{in_size} cannot be upscaled to the + C{orig_size}. This can happen if the upscale factors are not + integers or are different. + """ - def n_keypoints(self, task: str | None) -> int: - if task is not None: - if task not in self._n_keypoints: - raise ValueError(f"Task '{task}' is not present in the dataset.") - return self._n_keypoints[task] - if len(self._n_keypoints) > 1: + def _infer_upscale_factor(in_size: int, orig_size: int) -> int | float: + factor = math.log2(orig_size) - math.log2(in_size) + if abs(round(factor) - factor) < 1e-6: + return int(round(factor)) + return factor + + if isinstance(in_size, int): + in_size = (in_size, in_size) + if isinstance(orig_size, int): + orig_size = (orig_size, orig_size) + in_height, in_width = in_size + orig_height, orig_width = orig_size + + width_factor = _infer_upscale_factor(in_width, orig_width) + height_factor = _infer_upscale_factor(in_height, orig_height) + + match (width_factor, height_factor): + case (int(wf), int(hf)) if wf == hf: + return wf + case (int(wf), int(hf)): raise ValueError( - "The dataset specifies multiple keypoint tasks, " - "please specify the 'task' argument to get the number of keypoints." + f"Width and height upscale factors are different. " + f"Width: {wf}, height: {hf}." ) - return next(iter(self._n_keypoints.values())) - - def class_names(self, task: str | None) -> list[str]: - """Gets the class names for the specified task. - - @type task: str | None - @param task: Task to get the class names for. - @rtype: list[str] - @return: List of class names for the specified label type. - @raises ValueError: If the dataset loader was not provided during - initialization. - @raises ValueError: If the dataset contains different class names for different - label types. - """ - if task is not None: - if task not in self.classes: - raise ValueError(f"Task type {task} is not present in the dataset.") - return self.classes[task] - class_names = list(self.classes.values())[0] - for classes in self.classes.values(): - if classes != class_names: - raise ValueError( - "The dataset contains different class names for different tasks." - ) - return class_names - - def autogenerate_anchors(self, n_heads: int) -> tuple[list[list[float]], float]: - """Automatically generates anchors for the provided dataset. - - @type n_heads: int - @param n_heads: Number of heads to generate anchors for. - @rtype: tuple[list[list[float]], float] - @return: List of anchors in [-1,6] format and recall of the anchors. 
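
For reference, a short sketch of the two resizing helpers introduced in this
hunk, `make_divisible` and `infer_upscale_factor` (values chosen purely for
illustration):

    from luxonis_train.utils.general import infer_upscale_factor, make_divisible

    assert make_divisible(97, 8) == 104  # ceil(97 / 8) * 8
    assert infer_upscale_factor((128, 160), (512, 640)) == 2  # 2**2 in both dims
    # infer_upscale_factor((100, 160), (512, 640)) raises ValueError,
    # because 512 / 100 is not an integer power of two.
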
- @raises ValueError: If the dataset loader was not provided during - initialization. - """ - if self.loader is None: + case (int(wf), float(hf)): raise ValueError( - "Cannot generate anchors without a dataset loader. " - "Please provide a dataset loader to the constructor " - "or call `set_loader` method." + f"Width upscale factor is an integer, but height upscale factor is not. " + f"Width: {wf}, height: {hf}." ) - - proposed_anchors, recall = anchors_from_dataset( - self.loader, n_anchors=n_heads * 3 - ) - return proposed_anchors.reshape(-1, 6).tolist(), recall - - def set_loader(self, loader: DataLoader) -> None: - """Sets the dataset loader. - - @type loader: DataLoader - @param loader: Dataset loader. - """ - self.loader = loader - - @classmethod - def from_loader(cls, loader: BaseLoaderTorch) -> "DatasetMetadata": - """Creates a L{DatasetMetadata} object from a L{LuxonisDataset}. - - @type dataset: LuxonisDataset - @param dataset: Dataset to create the metadata from. - @rtype: DatasetMetadata - @return: Instance of L{DatasetMetadata} created from the provided dataset. - """ - classes = loader.get_classes() - n_keypoints = loader.get_n_keypoints() - - return cls(classes=classes, n_keypoints=n_keypoints) - - -def make_divisible(x: int | float, divisor: int) -> int: - """Upward revision the value x to make it evenly divisible by the divisor.""" - return math.ceil(x / divisor) * divisor - - -def infer_upscale_factor( - in_height: int, orig_height: int, strict: bool = True, warn: bool = True -) -> int: - """Infer the upscale factor from the input height and original height.""" - num_up = math.log2(orig_height) - math.log2(in_height) - if abs(round(num_up) - num_up) < 1e-6: - return int(round(num_up)) - elif not strict: - if warn: - logging.getLogger(__name__).warning( - f"Upscale factor is not an integer: {num_up}. " - "Output shape will not be the same as input shape." + case (float(wf), int(hf)): + raise ValueError( + f"Height upscale factor is an integer, but width upscale factor is not. " + f"Width: {wf}, height: {hf}." + ) + case (float(wf), float(hf)): + raise ValueError( + "Width and height upscale factors are not integers. " + f"Width: {wf}, height: {hf}." ) - return round(num_up) - else: - raise ValueError( - f"Upscale factor is not an integer: {num_up}. " - "Output shape will not be the same as input shape." - ) + + raise NotImplementedError( + f"Unexpected case: {width_factor}, {height_factor}" + ) def to_shape_packet(packet: Packet[Tensor]) -> Packet[Size]: + """Converts a packet of tensors to a packet of shapes. Used for + debugging purposes. + + @type packet: Packet[Tensor] + @param packet: Packet of tensors. + @rtype: Packet[Size] + @return: Packet of shapes. + """ shape_packet: Packet[Size] = {} for name, value in packet.items(): shape_packet[name] = [x.shape for x in value] return shape_packet -def is_acyclic(graph: dict[str, list[str]]) -> bool: - """Tests if graph is acyclic. - - @type graph: dict[str, list[str]] - @param graph: Graph in a format of a dictionary of predecessors. Keys are node - names, values are inputs to the node (list of node names). - @rtype: bool - @return: True if graph is acyclic, False otherwise. 
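
Likewise, tiny usage sketches for `to_shape_packet` above and the
`get_with_default` helper introduced just below (hypothetical tensors and
action names):

    import torch

    from luxonis_train.utils.general import get_with_default, to_shape_packet

    packet = {"features": [torch.rand(1, 3, 64, 64), torch.rand(1, 3, 32, 32)]}
    print(to_shape_packet(packet))
    # {'features': [torch.Size([1, 3, 64, 64]), torch.Size([1, 3, 32, 32])]}

    # Returns the value when given, otherwise logs and falls back:
    assert get_with_default(None, "sigma selection", default=0.04) == 0.04
    assert get_with_default(0.1, "sigma selection", default=0.04) == 0.1
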
-    """
-    graph = graph.copy()
-
-    def dfs(node: str, visited: set[str], recursion_stack: set[str]):
-        visited.add(node)
-        recursion_stack.add(node)
-
-        for predecessor in graph.get(node, []):
-            if predecessor in recursion_stack:
-                return True
-            if predecessor not in visited:
-                if dfs(predecessor, visited, recursion_stack):
-                    return True
-
-        recursion_stack.remove(node)
-        return False
-
-    visited: set[str] = set()
-    recursion_stack: set[str] = set()
-
-    for node in graph.keys():
-        if node not in visited:
-            if dfs(node, visited, recursion_stack):
-                return False
-
-    return True
-
-
-def validate_packet(data: Packet[Tensor], protocol: type[BaseModel]) -> Packet[Tensor]:
-    return protocol(**data).model_dump()
-
-
 T = TypeVar("T")


-# TEST:
-def traverse_graph(
-    graph: dict[str, list[str]], nodes: dict[str, T]
-) -> Generator[tuple[str, T, list[str], list[str]], None, None]:
-    """Traverses the graph in topological order.
-
-    @type graph: dict[str, list[str]]
-    @param graph: Graph in a format of a dictionary of predecessors. Keys are node
-        names, values are inputs to the node (list of node names).
-    @type nodes: dict[str, T]
-    @param nodes: Dictionary mapping node names to node objects.
-    @rtype: Generator[tuple[str, T, list[str], list[str]], None, None]
-    @return: Generator of tuples containing node name, node object, node dependencies
-        and unprocessed nodes.
-    @raises RuntimeError: If the graph is malformed.
+def get_with_default(
+    value: T | None,
+    action_name: str,
+    caller_name: str | None = None,
+    *,
+    default: T,
+) -> T:
+    """Returns C{value} if it is not C{None}, otherwise returns the
+    default value and logs an info message.
+
+    @type value: T | None
+    @param value: Value to return.
+    @type action_name: str
+    @param action_name: Name of the action for which the default value
+        is being used. Used for logging.
+    @type caller_name: str | None
+    @param caller_name: Name of the caller function. Used for logging.
+    @type default: T
+    @param default: Default value to return if C{value} is C{None}.
+    @rtype: T
+    @return: C{value} if it is not C{None}, otherwise C{default}.
     """
-    unprocessed_nodes = sorted(
-        set(nodes.keys())
-    )  # sort the set to allow reproducibility
-    processed: set[str] = set()
+    if value is not None:
+        return value

-    graph = deepcopy(graph)
-    while unprocessed_nodes:
-        unprocessed_nodes_copy = unprocessed_nodes.copy()
-        for node_name in unprocessed_nodes_copy:
-            node_dependencies = graph[node_name]
-            if not node_dependencies or all(
-                dependency in processed for dependency in node_dependencies
-            ):
-                yield node_name, nodes[node_name], node_dependencies, unprocessed_nodes
-                processed.add(node_name)
-                unprocessed_nodes.remove(node_name)
+    msg = f"Default value of {default} is being used for {action_name}."

-    if unprocessed_nodes_copy == unprocessed_nodes:
-        raise RuntimeError(
-            "Malformed graph. "
-            "Please check that all nodes are connected in a directed acyclic graph."
-        )
+    if caller_name:
+        msg = f"[{caller_name}] {msg}"
+
+    logger.info(msg, stacklevel=2)
+    return default
diff --git a/luxonis_train/utils/graph.py b/luxonis_train/utils/graph.py
new file mode 100644
index 00000000..a2b72832
--- /dev/null
+++ b/luxonis_train/utils/graph.py
@@ -0,0 +1,92 @@
+from copy import deepcopy
+from typing import Iterator, TypeAlias, TypeVar
+
+Graph: TypeAlias = dict[str, list[str]]
+"""Graph in a format of a dictionary of predecessors.
+
+Keys are node names, values are inputs to the node (list of node names).
+"""
+
+
+def is_acyclic(graph: Graph) -> bool:
+    """Tests if graph is acyclic.
+ + @type graph: dict[str, list[str]] + @param graph: Graph in a format of a dictionary of predecessors. + Keys are node names, values are inputs to the node (list of node + names). + @rtype: bool + @return: True if graph is acyclic, False otherwise. + """ + graph = graph.copy() + + def dfs(node: str, visited: set[str], recursion_stack: set[str]): + visited.add(node) + recursion_stack.add(node) + + for predecessor in graph.get(node, []): + if predecessor in recursion_stack: + return True + if predecessor not in visited: + if dfs(predecessor, visited, recursion_stack): + return True + + recursion_stack.remove(node) + return False + + visited: set[str] = set() + recursion_stack: set[str] = set() + + for node in graph.keys(): + if node not in visited: + if dfs(node, visited, recursion_stack): + return False + + return True + + +T = TypeVar("T") + + +def traverse_graph( + graph: Graph, nodes: dict[str, T] +) -> Iterator[tuple[str, T, list[str], list[str]]]: + """Traverses the graph in topological order. + + @type graph: dict[str, list[str]] + @param graph: Graph in a format of a dictionary of predecessors. + Keys are node names, values are inputs to the node (list of node + names). + @type nodes: dict[str, T] + @param nodes: Dictionary mapping node names to node objects. + @rtype: Iterator[tuple[str, T, list[str], list[str]]] + @return: Iterator of tuples containing node name, node object, node + dependencies and unprocessed nodes. + @raises RuntimeError: If the graph is malformed. + """ + # sort the set to allow reproducibility + unprocessed_nodes = sorted(set(nodes.keys())) + processed: set[str] = set() + + graph = deepcopy(graph) + while unprocessed_nodes: + unprocessed_nodes_copy = unprocessed_nodes.copy() + for node_name in unprocessed_nodes_copy: + node_dependencies = graph[node_name] + if not node_dependencies or all( + dependency in processed for dependency in node_dependencies + ): + unprocessed_nodes.remove(node_name) + yield ( + node_name, + nodes[node_name], + node_dependencies, + unprocessed_nodes.copy(), + ) + processed.add(node_name) + + if unprocessed_nodes_copy == unprocessed_nodes: + raise RuntimeError( + "Malformed graph. " + "Please check that all nodes are connected in a directed acyclic graph." + ) diff --git a/luxonis_train/utils/keypoints.py b/luxonis_train/utils/keypoints.py new file mode 100644 index 00000000..9fbc741d --- /dev/null +++ b/luxonis_train/utils/keypoints.py @@ -0,0 +1,85 @@ +import logging + +import torch +from torch import Tensor + +logger = logging.getLogger(__name__) + + +def process_keypoints_predictions( + keypoints: Tensor, +) -> tuple[Tensor, Tensor, Tensor]: + """Extracts x, y and visibility from keypoints predictions. + + @type keypoints: Tensor + @param keypoints: Keypoints predictions. The last dimension must be divisible by 3 + and is expected to be in format [x1, y1, v1, x2, y2, v2, ...]. + + @rtype: tuple[Tensor, Tensor, Tensor] + @return: x, y and visibility tensors. + """ + x = keypoints[..., ::3] + y = keypoints[..., 1::3] + visibility = keypoints[..., 2::3] + return x, y, visibility + + +def get_sigmas( + sigmas: list[float] | None, + n_keypoints: int, + caller_name: str | None = None, +) -> Tensor: + """Validate or create sigma values for each keypoint. + + @type sigmas: list[float] | None + @param sigmas: List of sigmas for each keypoint. If C{None}, then + default sigmas are used. + @type n_keypoints: int + @param n_keypoints: Number of keypoints. + @type caller_name: str | None + @param caller_name: Name of the caller function. 
Used for logging. + @rtype: Tensor + @return: Tensor of sigmas. + """ + if sigmas is not None: + if len(sigmas) == n_keypoints: + return torch.tensor(sigmas, dtype=torch.float32) + else: + error_msg = "The length of the sigmas list must be the same as the number of keypoints." + if caller_name: + error_msg = f"[{caller_name}] {error_msg}" + raise ValueError(error_msg) + else: + if n_keypoints == 17: + msg = "Default COCO sigmas are being used." + if caller_name: + msg = f"[{caller_name}] {msg}" + logger.warning(msg) + return torch.tensor( + [ + 0.026, + 0.025, + 0.025, + 0.035, + 0.035, + 0.079, + 0.079, + 0.072, + 0.072, + 0.062, + 0.062, + 0.107, + 0.107, + 0.087, + 0.087, + 0.089, + 0.089, + ], + dtype=torch.float32, + ) + else: + msg = "Default sigma of 0.04 is being used for each keypoint." + if caller_name: + msg = f"[{caller_name}] {msg}" + logger.info(msg) + return torch.tensor([0.04] * n_keypoints, dtype=torch.float32) diff --git a/luxonis_train/utils/registry.py b/luxonis_train/utils/registry.py index 2222ecbd..02532d32 100644 --- a/luxonis_train/utils/registry.py +++ b/luxonis_train/utils/registry.py @@ -1,46 +1,46 @@ -"""This module implements a metaclass for automatic registration of classes.""" +"""This module implements a metaclass for automatic registration of +classes.""" import lightning.pytorch as pl -import torch from luxonis_ml.utils.registry import Registry +from torch.optim.lr_scheduler import _LRScheduler +from torch.optim.optimizer import Optimizer -import luxonis_train +import luxonis_train as lt CALLBACKS: Registry[type[pl.Callback]] = Registry(name="callbacks") """Registry for all callbacks.""" -LOADERS: Registry[type["luxonis_train.utils.loaders.BaseLoaderTorch"]] = Registry( +LOADERS: Registry[type["lt.loaders.BaseLoaderTorch"]] = Registry( name="loaders" ) """Registry for all loaders.""" -LOSSES: Registry[type["luxonis_train.attached_modules.BaseLoss"]] = Registry( +LOSSES: Registry[type["lt.attached_modules.BaseLoss"]] = Registry( name="losses" ) """Registry for all losses.""" -METRICS: Registry[type["luxonis_train.attached_modules.BaseMetric"]] = Registry( +METRICS: Registry[type["lt.attached_modules.BaseMetric"]] = Registry( name="metrics" ) """Registry for all metrics.""" -MODELS: Registry[type["luxonis_train.models.BasePredefinedModel"]] = Registry( +MODELS: Registry[type["lt.models.BasePredefinedModel"]] = Registry( name="models" ) """Registry for all models.""" -NODES: Registry[type["luxonis_train.nodes.BaseNode"]] = Registry(name="nodes") +NODES: Registry[type["lt.nodes.BaseNode"]] = Registry(name="nodes") """Registry for all nodes.""" -OPTIMIZERS: Registry[type[torch.optim.Optimizer]] = Registry(name="optimizers") +OPTIMIZERS: Registry[type[Optimizer]] = Registry(name="optimizers") """Registry for all optimizers.""" -SCHEDULERS: Registry[type[torch.optim.lr_scheduler._LRScheduler]] = Registry( - name="schedulers" -) +SCHEDULERS: Registry[type[_LRScheduler]] = Registry(name="schedulers") """Registry for all schedulers.""" -VISUALIZERS: Registry[type["luxonis_train.visualizers.BaseVisualizer"]] = Registry( +VISUALIZERS: Registry[type["lt.visualizers.BaseVisualizer"]] = Registry( "visualizers" ) """Registry for all visualizers.""" diff --git a/luxonis_train/utils/tracker.py b/luxonis_train/utils/tracker.py index 4df76edd..35d7af70 100644 --- a/luxonis_train/utils/tracker.py +++ b/luxonis_train/utils/tracker.py @@ -1,12 +1,15 @@ +from typing import Any + from lightning.pytorch.loggers.logger import Logger from lightning.pytorch.utilities import 
rank_zero_only  # type: ignore
 from luxonis_ml.tracker import LuxonisTracker


 class LuxonisTrackerPL(LuxonisTracker, Logger):
-    """Implementation of LuxonisTracker that is compatible with PytorchLightning."""
+    """Implementation of LuxonisTracker that is compatible with
+    PytorchLightning."""

-    def __init__(self, *, _auto_finalize: bool = True, **kwargs):
+    def __init__(self, *, _auto_finalize: bool = True, **kwargs: Any):
         """
         @type _auto_finalize: bool
         @param _auto_finalize: If True, the run will be finalized automatically when the training ends.
@@ -21,7 +24,7 @@ def __init__(self, *, _auto_finalize: bool = True, **kwargs):
             self.finalize = self._finalize

     @rank_zero_only
-    def _finalize(self, status: str = "success") -> None:
+    def _finalize(self, status: str = "success") -> None:  # pragma: no cover
         """Finalizes current run."""
         if self.is_tensorboard:
             self.experiment["tensorboard"].flush()
diff --git a/luxonis_train/utils/types.py b/luxonis_train/utils/types.py
index 84b8e019..3a7ca7f4 100644
--- a/luxonis_train/utils/types.py
+++ b/luxonis_train/utils/types.py
@@ -1,19 +1,21 @@
-from typing import Annotated, Any, Literal, TypeVar
+from typing import Any, Literal, TypeVar

 from luxonis_ml.data import LabelType
-from pydantic import BaseModel, Field, ValidationError
 from torch import Size, Tensor

 Kwargs = dict[str, Any]
-OutputTypes = Literal["boundingbox", "class", "keypoints", "segmentation", "features"]
+"""Kwargs is a dictionary containing keyword arguments."""
+
 Labels = dict[str, tuple[Tensor, LabelType]]
+"""Labels is a dictionary containing a tuple of tensors and their
+corresponding label type."""

 AttachIndexType = Literal["all"] | int | tuple[int, int] | tuple[int, int, int]
-"""AttachIndexType is used to specify to which output of the prevoius node does the
-current node attach to.
+"""AttachIndexType is used to specify which output of the previous
+node the current node attaches to.

-It can be either "all" (all outputs), an index of the output or a tuple of indices of
-the output (specifying a range of outputs).
+It can be either "all" (all outputs), an index of the output or a tuple
+of indices of the output (specifying a range of outputs).
 """

 T = TypeVar("T", Tensor, Size)
@@ -22,31 +24,3 @@

 It is used to pass data between different nodes of the network graph.
 """
-
-
-class IncompatibleException(Exception):
-    """Raised when two parts of the model are incompatible with each other."""
-
-    @classmethod
-    def from_validation_error(cls, val_error: ValidationError, class_name: str):
-        return cls(
-            f"{class_name} received an input not conforming to the protocol. "
-            f"Validation error: {val_error.errors(include_input=False, include_url=False)}."
-        )
-
-    @classmethod
-    def from_missing_task(cls, task: str, present_tasks: list[str], class_name: str):
-        return cls(
-            f"{class_name} requires '{task}' label, but it was not found in "
-            f"the label dictionary. Available labels: {present_tasks}."
- ) - - -class BaseProtocol(BaseModel): - class Config: - arbitrary_types_allowed = True - extra = "forbid" - - -class FeaturesProtocol(BaseProtocol): - features: Annotated[list[Tensor], Field(min_length=1)] diff --git a/media/coverage_badge.svg b/media/coverage_badge.svg index 8e21255a..34387324 100644 --- a/media/coverage_badge.svg +++ b/media/coverage_badge.svg @@ -9,13 +9,13 @@ - + coverage coverage - 84% - 84% + 97% + 97% diff --git a/pyproject.toml b/pyproject.toml index 2093e25b..d65978d4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,5 @@ [project] name = "luxonis-train" -version = "0.0.1" description = "Luxonis training framework for seamless training of various neural networks." readme = "README.md" requires-python = ">=3.10" @@ -8,7 +7,7 @@ license = { file = "LICENSE" } authors = [{ name = "Luxonis", email = "support@luxonis.com" }] maintainers = [{ name = "Luxonis", email = "support@luxonis.com" }] keywords = ["ml", "training", "luxonis", "oak"] -dynamic = ["dependencies", "optional-dependencies"] +dynamic = ["dependencies", "optional-dependencies", "version"] classifiers = [ "License :: OSI Approved :: Apache Software License", "Development Status :: 3 - Alpha", @@ -35,10 +34,11 @@ where = ["."] [tool.setuptools.dynamic] dependencies = { file = ["requirements.txt"] } optional-dependencies = { dev = { file = ["requirements-dev.txt"] } } +version = {attr = "luxonis_train.__version__"} [tool.ruff] target-version = "py310" -line-length = 88 +line-length = 79 indent-width = 4 [tool.ruff.lint] @@ -47,10 +47,44 @@ select = ["E4", "E7", "E9", "F", "W", "B", "I"] [tool.docformatter] black = true - -[tool.mypy] -python_version = "3.10" -ignore_missing_imports = true +wrap-summaries = 72 +wrap-descriptions = 72 [tool.pyright] typeCheckingMode = "basic" +reportMissingTypeStubs = "none" +reportPrivateImportUsage = "none" +reportPrivateUsage = "none" +reportIncompatibleVariableOverride = "none" +reportIncompatibleMethodOverride = "none" +reportUnnecessaryIsInstance = "none" + +[tool.pytest.ini_options] +testpaths = ["tests"] +addopts = "--disable-warnings" +markers = [ + "unit: mark a test as a unit test", + "integration: mark a test as an integration test", +] + +[tool.coverage.run] +omit = [ + "**/__main__.py", + "**/gpu_stats_monitor.py" +] + +[tool.coverage.report] +exclude_also = [ + "def __repr__", + "def __rich_repr__", + "def __str__", + "assert", + "raise NotImplementedError", + "except ImportError", + "@abstractmethod", + "@overload", + "exit\\(\\)", + "cv2\\.imshow", + "cv2\\.waitKey", + "logger\\.", +] diff --git a/requirements-dev.txt b/requirements-dev.txt index 7f915575..e4dbd194 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -4,3 +4,5 @@ pre-commit>=3.2.1 opencv-stubs>=0.0.8 pytest-cov>=4.1.0 pytest-subtests>=0.12.1 +pytest-md>=0.2.0 +pytest-order>=1.3.0 diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/configs/archive_config.yaml b/tests/configs/archive_config.yaml new file mode 100644 index 00000000..71589f4d --- /dev/null +++ b/tests/configs/archive_config.yaml @@ -0,0 +1,43 @@ + +model: + name: archive_test + nodes: + - name: EfficientRep + + - name: EfficientBBoxHead + inputs: + - EfficientRep + + - name: EfficientKeypointBBoxHead + inputs: + - EfficientRep + + - name: ImplicitKeypointBBoxHead + inputs: + - EfficientRep + + - name: SegmentationHead + inputs: + - EfficientRep + + - name: BiSeNetHead + inputs: + - EfficientRep + + - name: ClassificationHead + inputs: + - 
EfficientRep + +exporter: + output_names: + - seg0 + - class0 + - bbox0 + - bbox1 + - bbox2 + - effkpt0 + - effkpt1 + - effkpt2 + - impl + - seg1 + diff --git a/tests/configs/parking_lot_config.yaml b/tests/configs/parking_lot_config.yaml index ae9f8069..bb15ac37 100644 --- a/tests/configs/parking_lot_config.yaml +++ b/tests/configs/parking_lot_config.yaml @@ -3,54 +3,26 @@ model: name: parking_lot_model nodes: - - name: ReXNetV1_lite - alias: rexnet-detection-backbone - - name: EfficientRep - alias: efficient-detection-backbone - params: - channels_list: [64, 128, 256, 512, 1024] - num_repeats: [1, 6, 12, 18, 6] - depth_mul: 0.33 - width_mul: 0.33 + alias: backbone - name: RepPANNeck - alias: efficient-detection-neck + alias: neck inputs: - - efficient-detection-backbone - params: - channels_list: [256, 128, 128, 256, 256, 512] - num_repeats: [12, 12, 12, 12] - depth_mul: 0.33 - width_mul: 0.33 - - - name: MicroNet - alias: color-segmentation-backbone - - - name: MobileOne - alias: brand-segmentation-backbone - - - name: MobileNetV2 - alias: vehicle-type-segmentation-backbone - - - name: ContextSpatial - alias: context-brand-segmentation-backbone + - backbone - name: EfficientBBoxHead alias: bbox-head inputs: - - efficient-detection-neck + - neck - name: ImplicitKeypointBBoxHead alias: car-detection-head inputs: - - rexnet-detection-backbone + - neck task: keypoints: car-keypoints boundingbox: car-boundingbox - params: - conf_thres: 0.25 - iou_thres: 0.45 - name: EfficientKeypointBBoxHead alias: motorbike-detection-head @@ -58,40 +30,31 @@ model: keypoints: motorbike-keypoints boundingbox: motorbike-boundingbox inputs: - - efficient-detection-neck - params: - conf_thres: 0.25 - iou_thres: 0.45 - - - name: BiSeNetHead - alias: context-brand-segmentation-head - task: brand_segmentation - inputs: - - context-brand-segmentation-backbone + - neck - name: SegmentationHead alias: color-segmentation-head - task: color_segmentation + task: color-segmentation inputs: - - color-segmentation-backbone + - neck - name: SegmentationHead alias: any-vehicle-segmentation-head - task: vehicle_segmentation + task: vehicle-segmentation inputs: - - vehicle-type-segmentation-backbone + - neck - name: BiSeNetHead alias: brand-segmentation-head - task: brand_segmentation + task: brand-segmentation inputs: - - brand-segmentation-backbone + - neck - name: BiSeNetHead alias: vehicle-type-segmentation-head - task: vehicle_type_segmentation + task: vehicle_type-segmentation inputs: - - vehicle-type-segmentation-backbone + - neck losses: - name: AdaptiveDetectionLoss @@ -100,12 +63,8 @@ model: attached_to: any-vehicle-segmentation-head - name: CrossEntropyLoss attached_to: vehicle-type-segmentation-head - - name: CrossEntropyLoss - attached_to: context-brand-segmentation-head - name: CrossEntropyLoss attached_to: color-segmentation-head - - name: SoftmaxFocalLoss - attached_to: brand-segmentation-head - name: ImplicitKeypointBBoxLoss attached_to: car-detection-head - name: EfficientKeypointBBoxLoss @@ -127,8 +86,6 @@ model: attached_to: vehicle-type-segmentation-head - name: Precision attached_to: brand-segmentation-head - - name: Recall - attached_to: context-brand-segmentation-head visualizers: - name: MultiVisualizer @@ -160,9 +117,6 @@ model: - name: SegmentationVisualizer alias: vehicle-segmentation-visualizer attached_to: any-vehicle-segmentation-head - - name: SegmentationVisualizer - alias: context-brand-segmentation-visualizer - attached_to: context-brand-segmentation-head - name: SegmentationVisualizer alias: 
brand-segmentation-visualizer attached_to: brand-segmentation-head @@ -184,16 +138,16 @@ trainer: devices: auto strategy: auto - num_sanity_val_steps: 1 + n_sanity_val_steps: 1 profiler: null verbose: True batch_size: 2 accumulate_grad_batches: 1 epochs: 200 - num_workers: 8 + n_workers: 8 train_metrics_interval: -1 validation_interval: 10 - num_log_images: 8 + n_log_images: 8 skip_last_batch: True log_sub_losses: True save_top_k: 3 @@ -214,6 +168,5 @@ trainer: callbacks: - name: ExportOnTrainEnd - - name: TestOnTrainEnd - name: ArchiveOnTrainEnd diff --git a/tests/configs/segmentation_parse_loader.yaml b/tests/configs/segmentation_parse_loader.yaml index 60f7a30d..14814571 100644 --- a/tests/configs/segmentation_parse_loader.yaml +++ b/tests/configs/segmentation_parse_loader.yaml @@ -22,6 +22,6 @@ trainer: batch_size: 4 epochs: &epochs 1 - num_workers: 4 + n_workers: 4 validation_interval: 1 - num_log_images: 8 + n_log_images: 8 diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 00000000..4a8a492c --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,18 @@ +import pytest + + +def pytest_collection_modifyitems(items): + for item in items: + if "/unittests/" in str(item.fspath): + item.add_marker(pytest.mark.unit) + # ensure unittests run before integration tests + item.add_marker(pytest.mark.order(0)) + elif "/integration/" in str(item.fspath): + item.add_marker(pytest.mark.integration) + + +def pytest_configure(config): + config.addinivalue_line("markers", "unit: mark test as a unit test") + config.addinivalue_line( + "markers", "integration: mark test as an integration test" + ) diff --git a/tests/integration/__init__.py b/tests/integration/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 9b24271b..ef5a2142 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -1,9 +1,14 @@ import json +import multiprocessing as mp +import os +import shutil from collections import defaultdict from pathlib import Path +from typing import Any import cv2 import gdown +import numpy as np import pytest import torchvision from luxonis_ml.data import LuxonisDataset @@ -12,15 +17,34 @@ from luxonis_ml.utils import LuxonisFileSystem, environ WORK_DIR = Path("tests", "data") -WORK_DIR.mkdir(parents=True, exist_ok=True) -environ.LUXONISML_BASE_PATH = WORK_DIR / "luxonisml" + +@pytest.fixture(scope="session") +def test_output_dir() -> Path: + return Path("tests/integration/save-directory") + + +@pytest.fixture(scope="session", autouse=True) +def setup(test_output_dir: Path): + WORK_DIR.mkdir(parents=True, exist_ok=True) + shutil.rmtree(WORK_DIR / "luxonisml", ignore_errors=True) + shutil.rmtree(test_output_dir, ignore_errors=True) + environ.LUXONISML_BASE_PATH = WORK_DIR / "luxonisml" + test_output_dir.mkdir(exist_ok=True) @pytest.fixture +def train_overfit() -> bool: + return bool(os.getenv("LUXONIS_TRAIN_OVERFIT")) + + +@pytest.fixture(scope="session") def parking_lot_dataset() -> LuxonisDataset: url = "gs://luxonis-test-bucket/luxonis-ml-test-data/D1_ParkingSlotTest" - base_path = LuxonisFileSystem.download(url, WORK_DIR) + base_path = WORK_DIR / "D1_ParkingSlotTest" + if not base_path.exists(): + base_path = LuxonisFileSystem.download(url, WORK_DIR) + mask_brand_path = base_path / "mask_brand" mask_color_path = base_path / "mask_color" kpt_mask_path = base_path / "keypoints_mask_vehicle" @@ -28,7 +52,7 @@ def parking_lot_dataset() -> LuxonisDataset: def generator(): 
filenames: dict[int, Path] = {} for base_path in [kpt_mask_path, mask_brand_path, mask_color_path]: - for sequence_path in list(sorted(base_path.glob("sequence.*"))): + for sequence_path in sorted(list(base_path.glob("sequence.*"))): frame_data = sequence_path / "step0.frame_data.json" with open(frame_data) as f: data = json.load(f)["captures"][0] @@ -52,7 +76,9 @@ def generator(): for bbox_annotation in annotations.get( "BoundingBox2DAnnotation", defaultdict(list) )["values"]: - class_ = bbox_annotation["labelName"].split("-")[-1].lower() + class_ = ( + bbox_annotation["labelName"].split("-")[-1].lower() + ) if class_ == "motorbiek": class_ = "motorbike" x, y = bbox_annotation["origin"] @@ -113,7 +139,10 @@ def generator(): ] mask = cv2.cvtColor( cv2.imread( - str(sequence_path / vehicle_type_segmentation["filename"]) + str( + sequence_path + / vehicle_type_segmentation["filename"] + ) ), cv2.COLOR_BGR2RGB, ) @@ -122,11 +151,11 @@ def generator(): for inst in vehicle_type_segmentation["instances"] } if base_path == kpt_mask_path: - task = "vehicle_type_segmentation" + task = "vehicle_type-segmentation" elif base_path == mask_brand_path: - task = "brand_segmentation" + task = "brand-segmentation" else: - task = "color_segmentation" + task = "color-segmentation" for class_, mask_ in rgb_to_bool_masks( mask, classes, add_background_class=True ): @@ -145,36 +174,40 @@ def generator(): "annotation": { "type": "mask", "class": "vehicle", - "task": "vehicle_segmentation", + "task": "vehicle-segmentation", "mask": mask.astype(bool)[..., 0] | mask.astype(bool)[..., 1] | mask.astype(bool)[..., 2], }, } - dataset = LuxonisDataset("__D1ParkingSLot-test", delete_existing=True) + dataset = LuxonisDataset("_ParkingLot", delete_existing=True) dataset.add(generator()) + np.random.seed(42) dataset.make_splits() return dataset -@pytest.fixture(scope="session", autouse=True) -def create_coco_dataset(): +@pytest.fixture(scope="session") +def coco_dataset() -> LuxonisDataset: dataset_name = "coco_test" url = "https://drive.google.com/uc?id=1XlvFK7aRmt8op6-hHkWVKIJQeDtOwoRT" output_zip = WORK_DIR / "COCO_people_subset.zip" - if not output_zip.exists() and not (WORK_DIR / "COCO_people_subset").exists(): + if ( + not output_zip.exists() + and not (WORK_DIR / "COCO_people_subset").exists() + ): gdown.download(url, str(output_zip), quiet=False) parser = LuxonisParser( str(output_zip), dataset_name=dataset_name, delete_existing=True ) - parser.parse(random_split=True) + return parser.parse(random_split=True) -@pytest.fixture(scope="session", autouse=True) -def create_cifar10_dataset(): +@pytest.fixture(scope="session") +def cifar10_dataset() -> LuxonisDataset: dataset = LuxonisDataset("cifar10_test", delete_existing=True) output_folder = WORK_DIR / "cifar10" output_folder.mkdir(parents=True, exist_ok=True) @@ -210,3 +243,40 @@ def CIFAR10_subset_generator(): dataset.add(CIFAR10_subset_generator()) dataset.make_splits() + return dataset + + +@pytest.fixture +def config(train_overfit: bool) -> dict[str, Any]: + if train_overfit: + epochs = 100 + else: + epochs = 1 + + return { + "tracker": { + "save_directory": "tests/integration/save-directory", + }, + "loader": { + "train_view": "val", + "params": { + "dataset_name": "_ParkingLot", + }, + }, + "trainer": { + "batch_size": 4, + "epochs": epochs, + "n_workers": mp.cpu_count(), + "validation_interval": epochs, + "save_top_k": 0, + "preprocessing": { + "train_image_size": [256, 320], + "keep_aspect_ratio": False, + "normalize": {"active": True}, + }, + "callbacks": [ + 
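+                # These are registry names, the same ones accepted by
+                # `trainer.callbacks` in the YAML configs; only the export
+                # callback is needed for these integration runs.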
{"name": "ExportOnTrainEnd"}, + ], + "matmul_precision": "medium", + }, + } diff --git a/tests/integration/multi_input_modules.py b/tests/integration/multi_input_modules.py index dbc5a449..e6fd0476 100644 --- a/tests/integration/multi_input_modules.py +++ b/tests/integration/multi_input_modules.py @@ -1,9 +1,10 @@ import torch +from luxonis_ml.data import LabelType from torch import Tensor, nn +from luxonis_train.loaders import BaseLoaderTorch from luxonis_train.nodes import BaseNode -from luxonis_train.utils.loaders import BaseLoaderTorch -from luxonis_train.utils.types import FeaturesProtocol, LabelType, Packet +from luxonis_train.utils import Packet class CustomMultiInputLoader(BaseLoaderTorch): @@ -60,29 +61,23 @@ def unwrap(self, inputs: list[dict[str, list[Tensor]]]): return [item for inp in inputs for key in inp for item in inp[key]] -class FullBackbone(MultiInputTestBaseNode): - input_protocols = [FeaturesProtocol] * 4 +class FullBackbone(MultiInputTestBaseNode): ... -class RGBDBackbone(MultiInputTestBaseNode): - input_protocols = [FeaturesProtocol] * 3 +class RGBDBackbone(MultiInputTestBaseNode): ... -class PointcloudBackbone(MultiInputTestBaseNode): - input_protocols = [FeaturesProtocol] +class PointcloudBackbone(MultiInputTestBaseNode): ... -class FusionNeck(MultiInputTestBaseNode): - input_protocols = [FeaturesProtocol] * 3 +class FusionNeck(MultiInputTestBaseNode): ... -class FusionNeck2(MultiInputTestBaseNode): - input_protocols = [FeaturesProtocol] * 3 +class FusionNeck2(MultiInputTestBaseNode): ... class CustomSegHead1(MultiInputTestBaseNode): tasks = {LabelType.SEGMENTATION: "segmentation"} - input_protocols = [FeaturesProtocol] def __init__(self, **kwargs): super().__init__(**kwargs) @@ -98,7 +93,6 @@ def forward(self, inputs: Tensor): class CustomSegHead2(MultiInputTestBaseNode): tasks = {LabelType.SEGMENTATION: "segmentation"} - input_protocols = [FeaturesProtocol] * 3 def __init__(self, **kwargs): super().__init__(**kwargs) diff --git a/tests/integration/parking_lot.json b/tests/integration/parking_lot.json index d9599642..0059241e 100644 --- a/tests/integration/parking_lot.json +++ b/tests/integration/parking_lot.json @@ -36,7 +36,7 @@ ], "outputs": [ { - "name": "any-vehicle-segmentation-head/vehicle_segmentation/0", + "name": "any-vehicle-segmentation-head/vehicle-segmentation/0", "dtype": "float32", "shape": [ 1, @@ -80,7 +80,7 @@ "layout": "NCHW" }, { - "name": "brand-segmentation-head/brand_segmentation/0", + "name": "brand-segmentation-head/brand-segmentation/0", "dtype": "float32", "shape": [ 1, @@ -95,13 +95,13 @@ "dtype": "float32", "shape": [ 1, - 66240, + 5040, 24 ], "layout": "NCD" }, { - "name": "color-segmentation-head/color_segmentation/0", + "name": "color-segmentation-head/color-segmentation/0", "dtype": "float32", "shape": [ 1, @@ -111,17 +111,6 @@ ], "layout": "NCHW" }, - { - "name": "context-brand-segmentation-head/brand_segmentation/0", - "dtype": "float32", - "shape": [ - 1, - 23, - 256, - 320 - ], - "layout": "NCHW" - }, { "name": "motorbike-detection-head/outputs/0", "dtype": "float32", @@ -156,7 +145,7 @@ "layout": "NCDE" }, { - "name": "vehicle-type-segmentation-head/vehicle_type_segmentation/0", + "name": "vehicle-type-segmentation-head/vehicle_type-segmentation/0", "dtype": "float32", "shape": [ 1, @@ -227,42 +216,6 @@ "motorbike-detection-head/outputs/2" ] }, - { - "parser": "SegmentationParser", - "metadata": { - "postprocessor_path": null, - "classes": [ - "background", - "chrysler", - "bmw", - "ducati", - "dodge", - "ferrari", - 
"infiniti", - "land-rover", - "roll-royce", - "saab", - "Kawasaki", - "moto", - "truimph", - "alfa-romeo", - "harley", - "honda", - "jeep", - "aprilia", - "piaggio", - "yamaha", - "buick", - "pontiac", - "isuzu" - ], - "n_classes": 23, - "is_softmax": false - }, - "outputs": [ - "context-brand-segmentation-head/brand_segmentation/0" - ] - }, { "parser": "SegmentationParser", "metadata": { @@ -277,7 +230,7 @@ "is_softmax": false }, "outputs": [ - "color-segmentation-head/color_segmentation/0" + "color-segmentation-head/color-segmentation/0" ] }, { @@ -291,7 +244,7 @@ "is_softmax": false }, "outputs": [ - "any-vehicle-segmentation-head/vehicle_segmentation/0" + "any-vehicle-segmentation-head/vehicle-segmentation/0" ] }, { @@ -327,7 +280,7 @@ "is_softmax": false }, "outputs": [ - "brand-segmentation-head/brand_segmentation/0" + "brand-segmentation-head/brand-segmentation/0" ] }, { @@ -343,7 +296,7 @@ "is_softmax": false }, "outputs": [ - "vehicle-type-segmentation-head/vehicle_type_segmentation/0" + "vehicle-type-segmentation-head/vehicle_type-segmentation/0" ] } ] diff --git a/tests/integration/test_detection.py b/tests/integration/test_detection.py new file mode 100644 index 00000000..fb184b6f --- /dev/null +++ b/tests/integration/test_detection.py @@ -0,0 +1,95 @@ +from typing import Any + +import pytest +from luxonis_ml.data import LuxonisDataset + +from luxonis_train.core import LuxonisModel +from luxonis_train.nodes.backbones import __all__ as BACKBONES + + +def get_opts(backbone: str) -> dict[str, Any]: + return { + "model": { + "nodes": [ + { + "name": backbone, + }, + { + "name": "EfficientBBoxHead", + "inputs": [backbone], + }, + { + "name": "EfficientKeypointBBoxHead", + "task": { + "keypoints": "car-keypoints", + "boundingbox": "car-boundingbox", + }, + "inputs": [backbone], + }, + { + "name": "ImplicitKeypointBBoxHead", + "task": { + "keypoints": "car-keypoints", + "boundingbox": "car-boundingbox", + }, + "inputs": [backbone], + }, + ], + "losses": [ + { + "name": "AdaptiveDetectionLoss", + "attached_to": "EfficientBBoxHead", + }, + { + "name": "EfficientKeypointBBoxLoss", + "attached_to": "EfficientKeypointBBoxHead", + "params": {"area_factor": 0.5}, + }, + { + "name": "ImplicitKeypointBBoxLoss", + "attached_to": "ImplicitKeypointBBoxHead", + }, + ], + "metrics": [ + { + "name": "MeanAveragePrecision", + "attached_to": "EfficientBBoxHead", + }, + { + "name": "MeanAveragePrecisionKeypoints", + "alias": "EfficientKeypointBBoxHead-MaP", + "attached_to": "EfficientKeypointBBoxHead", + }, + { + "name": "MeanAveragePrecisionKeypoints", + "alias": "ImplicitKeypointBBoxHead-MaP", + "attached_to": "ImplicitKeypointBBoxHead", + }, + ], + } + } + + +def train_and_test( + config: dict[str, Any], + opts: dict[str, Any], + train_overfit: bool = False, +): + model = LuxonisModel(config, opts) + model.train() + results = model.test(view="val") + if train_overfit: + for name, value in results.items(): + if "/map_50" in name or "/kpt_map_medium" in name: + assert value > 0.8, f"{name} = {value} (expected > 0.8)" + + +@pytest.mark.parametrize("backbone", BACKBONES) +def test_backbones( + backbone: str, + config: dict[str, Any], + parking_lot_dataset: LuxonisDataset, +): + opts = get_opts(backbone) + opts["loader.params.dataset_name"] = parking_lot_dataset.identifier + train_and_test(config, opts) diff --git a/tests/integration/test_sanity.py b/tests/integration/test_sanity.py deleted file mode 100644 index 5afa385b..00000000 --- a/tests/integration/test_sanity.py +++ /dev/null @@ -1,136 +0,0 @@ 
-import json -import shutil -import sys -import tarfile -from copy import deepcopy -from pathlib import Path - -import pytest -from luxonis_ml.data import LuxonisDataset -from multi_input_modules import * - -from luxonis_train.core import LuxonisModel - -TEST_OUTPUT = Path("tests/integration/_test-output") -INFER_PATH = Path("tests/integration/_infer_save_dir") -ONNX_PATH = Path("tests/integration/_model.onnx") -STUDY_PATH = Path("study_local.db") - -OPTS = { - "trainer.epochs": 1, - "trainer.batch_size": 1, - "trainer.validation_interval": 1, - "trainer.callbacks": "[]", - "tracker.save_directory": str(TEST_OUTPUT), - "tuner.n_trials": 4, -} - - -@pytest.fixture(scope="session", autouse=True) -def manage_out_dir(): - shutil.rmtree(TEST_OUTPUT, ignore_errors=True) - TEST_OUTPUT.mkdir(exist_ok=True) - - -@pytest.fixture(scope="function", autouse=True) -def clear_files(): - yield - STUDY_PATH.unlink(missing_ok=True) - ONNX_PATH.unlink(missing_ok=True) - shutil.rmtree(INFER_PATH, ignore_errors=True) - - -@pytest.mark.parametrize( - "config_file", - [ - "classification_model", - "segmentation_model", - "detection_model", - "keypoint_bbox_model", - "resnet_model", - "coco_model", - "efficient_coco_model", - ], -) -def test_simple_models(config_file: str): - config_file = f"configs/{config_file}.yaml" - model = LuxonisModel(config_file, opts=OPTS) - model.train() - model.test() - model.export() - assert ( - Path(model.run_save_dir, "export", model.cfg.model.name) - .with_suffix(".onnx") - .exists() - ) - model.archive() - assert ( - Path( - model.run_save_dir, - "archive", - model.cfg.archiver.name or model.cfg.model.name, - ) - .with_suffix(".onnx.tar.xz") - .exists() - ) - del model - - -def test_multi_input(): - config_file = "configs/example_multi_input.yaml" - model = LuxonisModel(config_file, opts=OPTS) - model.train() - model.test(view="val") - - assert not ONNX_PATH.exists() - model.export(str(ONNX_PATH)) - assert ONNX_PATH.exists() - - assert not INFER_PATH.exists() - model.infer(view="val", save_dir=INFER_PATH) - assert INFER_PATH.exists() - del model - - -def test_custom_tasks(parking_lot_dataset: LuxonisDataset, subtests): - config_file = "tests/configs/parking_lot_config.yaml" - opts = deepcopy(OPTS) | { - "loader.params.dataset_name": parking_lot_dataset.dataset_name, - "trainer.batch_size": 2, - } - del opts["trainer.callbacks"] - model = LuxonisModel(config_file, opts=opts) - model.train() - archive_path = Path( - model.run_save_dir, "archive", model.cfg.model.name - ).with_suffix(".onnx.tar.xz") - correct_archive_config = json.loads( - Path("tests/integration/parking_lot.json").read_text() - ) - - with subtests.test("test_archive"): - assert archive_path.exists() - with tarfile.open(archive_path) as tar: - extracted_cfg = tar.extractfile("config.json") - - assert extracted_cfg is not None, "Config JSON not found in the archive." 
- generated_config = json.loads(extracted_cfg.read().decode()) - - del generated_config["model"]["heads"][1]["metadata"]["anchors"] - assert generated_config == correct_archive_config - - del model - - -def test_parsing_loader(): - model = LuxonisModel("tests/configs/segmentation_parse_loader.yaml") - model.train() - del model - - -@pytest.mark.skipif(sys.platform == "win32", reason="Tuning not supported on Windows") -def test_tuner(): - model = LuxonisModel("configs/example_tuning.yaml", opts=OPTS) - model.tune() - assert STUDY_PATH.exists() - del model diff --git a/tests/integration/test_segmentation.py b/tests/integration/test_segmentation.py new file mode 100644 index 00000000..c24e6fb9 --- /dev/null +++ b/tests/integration/test_segmentation.py @@ -0,0 +1,134 @@ +from typing import Any + +import pytest +from luxonis_ml.data import LuxonisDataset + +from luxonis_train.core import LuxonisModel +from luxonis_train.nodes.backbones import __all__ as BACKBONES + + +def get_opts(backbone: str) -> dict[str, Any]: + opts = { + "model": { + "nodes": [ + { + "name": backbone, + }, + { + "name": "SegmentationHead", + "alias": "seg-color-segmentation", + "task": "color-segmentation", + "inputs": [backbone], + }, + { + "name": "BiSeNetHead", + "alias": "bi-color-segmentation", + "task": "color-segmentation", + "inputs": [backbone], + }, + { + "name": "SegmentationHead", + "alias": "seg-vehicle-segmentation", + "task": "vehicle-segmentation", + "inputs": [backbone], + }, + { + "name": "BiSeNetHead", + "alias": "bi-vehicle-segmentation", + "task": "vehicle-segmentation", + "inputs": [backbone], + }, + { + "name": "SegmentationHead", + "alias": "seg-vehicle-segmentation-2", + "task": "vehicle-segmentation", + "inputs": [backbone], + }, + { + "name": "SegmentationHead", + "alias": "seg-vehicle-segmentation-3", + "task": "vehicle-segmentation", + "inputs": [backbone], + }, + ], + "losses": [ + { + "name": "CrossEntropyLoss", + "attached_to": "seg-color-segmentation", + }, + { + "name": "CrossEntropyLoss", + "attached_to": "bi-color-segmentation", + }, + { + "name": "BCEWithLogitsLoss", + "attached_to": "seg-vehicle-segmentation", + }, + { + "name": "SigmoidFocalLoss", + "attached_to": "bi-vehicle-segmentation", + "params": {"alpha": 0.5, "gamma": 1.0}, + }, + { + "name": "SoftmaxFocalLoss", + "attached_to": "seg-vehicle-segmentation-2", + "params": {"alpha": 0.5, "gamma": 1.0}, + }, + { + "name": "SmoothBCEWithLogitsLoss", + "attached_to": "seg-vehicle-segmentation-3", + "params": {"label_smoothing": 0.1}, + }, + ], + "metrics": [], + "visualizers": [], + } + } + aliases = [head["alias"] for head in opts["model"]["nodes"][1:]] + for alias in aliases: + opts["model"]["metrics"].extend( + [ + { + "name": "JaccardIndex", + "alias": f"JaccardIndex_{alias}", + "attached_to": alias, + }, + { + "name": "F1Score", + "alias": f"F1Score_{alias}", + "attached_to": alias, + }, + ] + ) + opts["model"]["visualizers"].append( + { + "name": "SegmentationVisualizer", + "attached_to": alias, + } + ) + return opts + + +def train_and_test( + config: dict[str, Any], + opts: dict[str, Any], + train_overfit: bool = False, +): + model = LuxonisModel(config, opts) + model.train() + results = model.test(view="val") + if train_overfit: + for name, value in results.items(): + if "metric" in name: + assert value > 0.8, f"{name} = {value} (expected > 0.8)" + + +@pytest.mark.parametrize("backbone", BACKBONES) +def test_backbones( + backbone: str, + config: dict[str, Any], + parking_lot_dataset: LuxonisDataset, +): + opts = 
get_opts(backbone)
+    opts["loader.params.dataset_name"] = parking_lot_dataset.identifier
+    train_and_test(config, opts)
diff --git a/tests/integration/test_simple.py b/tests/integration/test_simple.py
new file mode 100644
index 00000000..784db01a
--- /dev/null
+++ b/tests/integration/test_simple.py
@@ -0,0 +1,217 @@
+import json
+import shutil
+import sys
+import tarfile
+from copy import deepcopy
+from pathlib import Path
+from typing import Any
+
+import pytest
+from luxonis_ml.data import LuxonisDataset
+from luxonis_ml.utils import environ
+
+from luxonis_train.core import LuxonisModel
+
+from .multi_input_modules import *
+
+INFER_PATH = Path("tests/integration/infer-save-directory")
+ONNX_PATH = Path("tests/integration/_model.onnx")
+STUDY_PATH = Path("study_local.db")
+
+
+@pytest.fixture
+def opts(test_output_dir: Path) -> dict[str, Any]:
+    return {
+        "trainer.epochs": 1,
+        "trainer.batch_size": 1,
+        "trainer.validation_interval": 1,
+        "trainer.callbacks": "[]",
+        "tracker.save_directory": str(test_output_dir),
+        "tuner.n_trials": 4,
+    }
+
+
+@pytest.fixture(scope="function", autouse=True)
+def clear_files():
+    # Teardown: remove artifacts produced by the tests.
+    yield
+    STUDY_PATH.unlink(missing_ok=True)
+    ONNX_PATH.unlink(missing_ok=True)
+    shutil.rmtree(INFER_PATH, ignore_errors=True)
+
+
+@pytest.mark.parametrize(
+    "config_file",
+    [
+        "classification_model",
+        "segmentation_model",
+        "detection_model",
+        "keypoint_bbox_model",
+    ],
+)
+def test_predefined_models(
+    opts: dict[str, Any],
+    config_file: str,
+    coco_dataset: LuxonisDataset,
+    cifar10_dataset: LuxonisDataset,
+):
+    config_file = f"configs/{config_file}.yaml"
+    opts |= {
+        "loader.params.dataset_name": cifar10_dataset.dataset_name
+        if "classification_model" in config_file
+        else coco_dataset.dataset_name,
+    }
+    model = LuxonisModel(config_file, opts)
+    model.train()
+    model.test()
+
+
+def test_multi_input(opts: dict[str, Any]):
+    config_file = "configs/example_multi_input.yaml"
+    model = LuxonisModel(config_file, opts)
+    model.train()
+    model.test(view="val")
+
+    assert not ONNX_PATH.exists()
+    model.export(str(ONNX_PATH))
+    assert ONNX_PATH.exists()
+
+    assert not INFER_PATH.exists()
+    model.infer(view="val", save_dir=INFER_PATH)
+    assert INFER_PATH.exists()
+
+
+def test_custom_tasks(
+    opts: dict[str, Any], parking_lot_dataset: LuxonisDataset, subtests
+):
+    config_file = "tests/configs/parking_lot_config.yaml"
+    opts |= {
+        "loader.params.dataset_name": parking_lot_dataset.dataset_name,
+        "trainer.batch_size": 2,
+    }
+    del opts["trainer.callbacks"]
+    model = LuxonisModel(config_file, opts)
+    model.train()
+    archive_path = Path(
+        model.run_save_dir, "archive", model.cfg.model.name
+    ).with_suffix(".onnx.tar.xz")
+    correct_archive_config = json.loads(
+        Path("tests/integration/parking_lot.json").read_text()
+    )
+
+    with subtests.test("test_archive"):
+        assert archive_path.exists()
+        with tarfile.open(archive_path) as tar:
+            extracted_cfg = tar.extractfile("config.json")
+
+            assert (
+                extracted_cfg is not None
+            ), "Config JSON not found in the archive."
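+            # Compare against the stored reference config; auto-generated
+            # anchors are removed first, as they may vary between runs.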
+ generated_config = json.loads(extracted_cfg.read().decode()) + + del generated_config["model"]["heads"][1]["metadata"]["anchors"] + assert generated_config == correct_archive_config + + +@pytest.mark.skipif( + environ.GOOGLE_APPLICATION_CREDENTIALS is None, + reason="GCP credentials not set", +) +def test_parsing_loader(): + model = LuxonisModel("tests/configs/segmentation_parse_loader.yaml") + model.train() + + +@pytest.mark.skipif( + sys.platform == "win32", + reason="Tuning not supported on Windows", +) +def test_tune(opts: dict[str, Any], coco_dataset: LuxonisDataset): + opts["tuner.params"] = { + "trainer.optimizer.name_categorical": ["Adam", "SGD"], + "trainer.optimizer.params.lr_float": [0.0001, 0.001], + "trainer.batch_size_int": [4, 16, 4], + "trainer.preprocessing.augmentations_subset": [ + ["Defocus", "Sharpen", "Flip", "Normalize", "invalid"], + 2, + ], + "model.losses.0.weight_uniform": [0.1, 0.9], + "model.nodes.0.freezing.unfreeze_after_loguniform": [0.1, 0.9], + } + opts["loader.params.dataset_name"] = coco_dataset.identifier + model = LuxonisModel("configs/example_tuning.yaml", opts) + model.tune() + assert STUDY_PATH.exists() + + +def test_archive(test_output_dir: Path, coco_dataset: LuxonisDataset): + opts = { + "tracker.save_directory": str(test_output_dir), + "loader.params.dataset_name": coco_dataset.identifier, + } + model = LuxonisModel("tests/configs/archive_config.yaml", opts) + model.archive() + assert ( + Path( + model.run_save_dir, + "archive", + model.cfg.archiver.name or model.cfg.model.name, + ) + .with_suffix(".onnx.tar.xz") + .exists() + ) + + +def test_callbacks(opts: dict[str, Any], parking_lot_dataset: LuxonisDataset): + config_file = "tests/configs/parking_lot_config.yaml" + opts = deepcopy(opts) + del opts["trainer.callbacks"] + opts |= { + "trainer.use_rich_progress_bar": False, + "trainer.seed": 42, + "trainer.deterministic": "warn", + "trainer.callbacks": [ + { + "name": "MetadataLogger", + "params": { + "hyperparams": ["trainer.epochs", "trainer.batch_size"], + }, + }, + {"name": "TestOnTrainEnd"}, + {"name": "UploadCheckpoint"}, + { + "name": "ExportOnTrainEnd", + }, + { + "name": "ArchiveOnTrainEnd", + "params": {"preferred_checkpoint": "loss"}, + }, + ], + "exporter.scale_values": [0.5, 0.5, 0.5], + "exporter.mean_values": [0.5, 0.5, 0.5], + "exporter.blobconverter.active": True, + } + opts["loader.params.dataset_name"] = parking_lot_dataset.identifier + model = LuxonisModel(config_file, opts) + model.train() + + +def test_freezing(opts: dict[str, Any], coco_dataset: LuxonisDataset): + config_file = "configs/segmentation_model.yaml" + opts = deepcopy(opts) + opts |= { + "model.predefined_model.params": { + "head_params": { + "freezing": { + "active": True, + "unfreeze_after": 2, + }, + } + } + } + opts["trainer.epochs"] = 3 + opts["loader.params.dataset_name"] = coco_dataset.identifier + model = LuxonisModel(config_file, opts) + model.train() diff --git a/tests/unittests/__init__.py b/tests/unittests/__init__.py index f9269fdf..e69de29b 100644 --- a/tests/unittests/__init__.py +++ b/tests/unittests/__init__.py @@ -1,2 +0,0 @@ -# import warnings -# warnings.filterwarnings("module", category=DeprecationWarning) diff --git a/tests/unittests/test_assigners/__init__.py b/tests/unittests/test_assigners/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/unittests/test_utils/test_assigners/test_atts_assigner.py b/tests/unittests/test_assigners/test_atts_assigner.py similarity index 88% rename from 
tests/unittests/test_utils/test_assigners/test_atts_assigner.py rename to tests/unittests/test_assigners/test_atts_assigner.py index a3801ebb..4ab6f939 100644 --- a/tests/unittests/test_utils/test_assigners/test_atts_assigner.py +++ b/tests/unittests/test_assigners/test_atts_assigner.py @@ -1,6 +1,6 @@ import torch -from luxonis_train.utils.assigners.atts_assigner import ATSSAssigner +from luxonis_train.assigners import ATSSAssigner def test_init(): @@ -25,7 +25,12 @@ def test_forward(): pred_bboxes = torch.rand(bs, n_anchors, 4) labels, bboxes, scores, mask, assigned_gt_idx = assigner.forward( - anchor_bboxes, n_level_bboxes, gt_labels, gt_bboxes, mask_gt, pred_bboxes + anchor_bboxes, + n_level_bboxes, + gt_labels, + gt_bboxes, + mask_gt, + pred_bboxes, ) assert labels.shape == (bs, n_anchors) @@ -59,7 +64,11 @@ def test_select_topk_candidates(): ) assert is_in_topk.shape == (batch_size, n_max_boxes, n_anchors) - assert topk_idxs.shape == (batch_size, n_max_boxes, topk * len(n_level_bboxes)) + assert topk_idxs.shape == ( + batch_size, + n_max_boxes, + topk * len(n_level_bboxes), + ) def test_get_positive_samples(): @@ -97,7 +106,11 @@ def test_get_final_assignments(): assigned_gt_idx = torch.randint(0, n_max_boxes, (batch_size, n_anchors)) mask_pos_sum = torch.randint(0, 2, (batch_size, n_anchors)) - assigned_labels, assigned_bboxes, assigned_scores = assigner._get_final_assignments( + ( + assigned_labels, + assigned_bboxes, + assigned_scores, + ) = assigner._get_final_assignments( gt_labels, gt_bboxes, assigned_gt_idx, mask_pos_sum ) diff --git a/tests/unittests/test_assigners/test_tal_assigner.py b/tests/unittests/test_assigners/test_tal_assigner.py new file mode 100644 index 00000000..cb94b62d --- /dev/null +++ b/tests/unittests/test_assigners/test_tal_assigner.py @@ -0,0 +1,135 @@ +import torch + +from luxonis_train.assigners import TaskAlignedAssigner + + +def test_init(): + assigner = TaskAlignedAssigner( + n_classes=80, topk=13, alpha=1.0, beta=6.0, eps=1e-9 + ) + assert assigner.n_classes == 80 + assert assigner.topk == 13 + assert assigner.alpha == 1.0 + assert assigner.beta == 6.0 + assert assigner.eps == 1e-9 + + +def test_forward(): + batch_size = 10 + n_anchors = 100 + n_max_boxes = 5 + n_classes = 80 + + assigner = TaskAlignedAssigner(n_classes=n_classes, topk=13) + + # Create mock inputs + pred_scores = torch.rand(batch_size, n_anchors, 1) + pred_bboxes = torch.rand(batch_size, n_anchors, 4) + anchor_points = torch.rand(n_anchors, 2) + gt_labels = torch.rand(batch_size, n_max_boxes, 1) + gt_bboxes = torch.zeros(batch_size, n_max_boxes, 4) # no gt bboxes + mask_gt = torch.rand(batch_size, n_max_boxes, 1) + + labels, bboxes, scores, mask, assigned_gt_idx = assigner.forward( + pred_scores, pred_bboxes, anchor_points, gt_labels, gt_bboxes, mask_gt + ) + + assert labels.shape == (batch_size, n_anchors) + assert bboxes.shape == (batch_size, n_anchors, 4) + assert scores.shape == ( + batch_size, + n_anchors, + n_classes, + ) + assert mask.shape == (batch_size, n_anchors) + assert assigned_gt_idx.shape == (batch_size, n_anchors) + + # Labels should be `n_classes` as there are no GT boxes + assert labels.unique().tolist() == [n_classes] + + # All results should be zero as there are no GT boxes + assert torch.equal(bboxes, torch.zeros_like(bboxes)) + assert torch.equal(scores, torch.zeros_like(scores)) + assert torch.equal(mask, torch.zeros_like(mask)) + assert torch.equal(assigned_gt_idx, torch.zeros_like(assigned_gt_idx)) + + +def test_get_alignment_metric(): + batch_size = 2 + 
n_anchors = 5 + n_max_boxes = 3 + n_classes = 80 + + pred_scores = torch.rand(batch_size, n_anchors, n_classes) + pred_bboxes = torch.rand(batch_size, n_anchors, 4) + gt_labels = torch.randint(0, n_classes, (batch_size, n_max_boxes, 1)) + gt_bboxes = torch.rand(batch_size, n_max_boxes, 4) + + assigner = TaskAlignedAssigner( + n_classes=n_classes, topk=13, alpha=1.0, beta=6.0, eps=1e-9 + ) + assigner.bs = pred_scores.size(0) + assigner.n_max_boxes = gt_bboxes.size(1) + + align_metric, overlaps = assigner._get_alignment_metric( + pred_scores, pred_bboxes, gt_labels, gt_bboxes + ) + + assert align_metric.shape == (batch_size, n_max_boxes, n_anchors) + assert overlaps.shape == (batch_size, n_max_boxes, n_anchors) + assert align_metric.dtype == torch.float32 + assert overlaps.dtype == torch.float32 + assert align_metric.min() >= 0 and align_metric.max() <= 1 + assert overlaps.min() >= 0 and overlaps.max() <= 1 + + +def test_select_topk_candidates(): + batch_size = 2 + n_max_boxes = 3 + n_anchors = 5 + topk = 2 + + metrics = torch.rand(batch_size, n_max_boxes, n_anchors) + mask_gt = torch.rand(batch_size, n_max_boxes, 1) + + assigner = TaskAlignedAssigner(n_classes=80, topk=topk) + + is_in_topk = assigner._select_topk_candidates(metrics) + topk_mask = mask_gt.repeat([1, 1, topk]).bool() + assert torch.equal( + assigner._select_topk_candidates(metrics), + assigner._select_topk_candidates(metrics, topk_mask=topk_mask), + ) + assert is_in_topk.shape == (batch_size, n_max_boxes, n_anchors) + assert is_in_topk.dtype == torch.float32 + + assert is_in_topk.sum(dim=-1).max() <= topk + + +def test_get_final_assignments(): + batch_size = 2 + n_max_boxes = 3 + n_anchors = 5 + n_classes = 80 + + gt_labels = torch.randint(0, n_classes, (batch_size, n_max_boxes, 1)) + gt_bboxes = torch.rand(batch_size, n_max_boxes, 4) + assigned_gt_idx = torch.randint(0, n_max_boxes, (batch_size, n_anchors)) + mask_pos_sum = torch.randint(0, 2, (batch_size, n_anchors)) + + assigner = TaskAlignedAssigner(n_classes=n_classes, topk=13) + assigner.bs = batch_size # Set batch size + assigner.n_max_boxes = gt_bboxes.size(1) + + ( + assigned_labels, + assigned_bboxes, + assigned_scores, + ) = assigner._get_final_assignments( + gt_labels, gt_bboxes, assigned_gt_idx, mask_pos_sum + ) + + assert assigned_labels.shape == (batch_size, n_anchors) + assert assigned_bboxes.shape == (batch_size, n_anchors, 4) + assert assigned_scores.shape == (batch_size, n_anchors, n_classes) + assert assigned_labels.min() >= 0 and assigned_labels.max() <= n_classes diff --git a/tests/unittests/test_utils/test_assigners/test_utils.py b/tests/unittests/test_assigners/test_utils.py similarity index 96% rename from tests/unittests/test_utils/test_assigners/test_utils.py rename to tests/unittests/test_assigners/test_utils.py index bf849e25..d10e1d47 100644 --- a/tests/unittests/test_utils/test_assigners/test_utils.py +++ b/tests/unittests/test_assigners/test_utils.py @@ -1,6 +1,6 @@ import torch -from luxonis_train.utils.assigners.utils import ( +from luxonis_train.assigners.utils import ( batch_iou, candidates_in_gt, fix_collisions, diff --git a/tests/unittests/test_base_attached_module.py b/tests/unittests/test_base_attached_module.py new file mode 100644 index 00000000..c6ffdd48 --- /dev/null +++ b/tests/unittests/test_base_attached_module.py @@ -0,0 +1,153 @@ +import pytest +from luxonis_ml.data import LabelType + +from luxonis_train import BaseLoss, BaseNode +from luxonis_train.utils.exceptions import IncompatibleException + + +class 
DummyBackbone(BaseNode): + def forward(self, _): ... + + +class DummySegmentationHead(BaseNode): + tasks = [LabelType.SEGMENTATION] + + def forward(self, _): ... + + +class DummyBBoxHead(BaseNode): + tasks = [LabelType.BOUNDINGBOX] + + def forward(self, _): ... + + +class DummyDetectionHead(BaseNode): + tasks = [LabelType.BOUNDINGBOX, LabelType.KEYPOINTS] + + def forward(self, _): ... + + +class DummyLoss(BaseLoss): + supported_labels = [ + LabelType.SEGMENTATION, + (LabelType.KEYPOINTS, LabelType.BOUNDINGBOX), + ] + + def forward(self, _): ... + + +class NoLabelLoss(BaseLoss): + def forward(self, _): ... + + +@pytest.fixture +def labels(): + return { + "segmentation": ("segmentation", LabelType.SEGMENTATION), + "keypoints": ("keypoints", LabelType.KEYPOINTS), + "boundingbox": ("boundingbox", LabelType.BOUNDINGBOX), + "classification": ("classification", LabelType.CLASSIFICATION), + } + + +@pytest.fixture +def inputs(): + return { + "features": ["features"], + "segmentation": ["segmentation"], + } + + +def test_valid_properties(): + head = DummySegmentationHead() + loss = DummyLoss(node=head) + no_labels_loss = NoLabelLoss(node=head) + assert loss.node == head + assert loss.node_tasks == {LabelType.SEGMENTATION: "segmentation"} + assert loss.required_labels == [LabelType.SEGMENTATION] + assert no_labels_loss.node == head + assert no_labels_loss.node_tasks == { + LabelType.SEGMENTATION: "segmentation" + } + assert no_labels_loss.required_labels == [] + + +def test_invalid_properties(): + backbone = DummyBackbone() + with pytest.raises(IncompatibleException): + DummyLoss(node=backbone) + with pytest.raises(IncompatibleException): + DummyLoss(node=DummyBBoxHead()) + with pytest.raises(RuntimeError): + _ = DummyLoss().node + with pytest.raises(RuntimeError): + _ = NoLabelLoss(node=backbone).node_tasks + + +def test_get_label(labels): + seg_head = DummySegmentationHead() + det_head = DummyDetectionHead() + seg_loss = DummyLoss(node=seg_head) + assert seg_loss.get_label(labels) == "segmentation" + assert seg_loss.get_label(labels, LabelType.SEGMENTATION) == "segmentation" + + del labels["segmentation"] + labels["segmentation-task"] = ("segmentation", LabelType.SEGMENTATION) + + with pytest.raises(IncompatibleException): + seg_loss.get_label(labels) + + det_loss = DummyLoss(node=det_head) + assert det_loss.get_label(labels, LabelType.KEYPOINTS) == "keypoints" + assert det_loss.get_label(labels, LabelType.BOUNDINGBOX) == "boundingbox" + + with pytest.raises(ValueError): + det_loss.get_label(labels) + + with pytest.raises(ValueError): + det_loss.get_label(labels, LabelType.SEGMENTATION) + + +def test_input_tensors(inputs): + seg_head = DummySegmentationHead() + seg_loss = DummyLoss(node=seg_head) + assert seg_loss.get_input_tensors(inputs) == ["segmentation"] + assert seg_loss.get_input_tensors(inputs, "segmentation") == [ + "segmentation" + ] + assert seg_loss.get_input_tensors(inputs, LabelType.SEGMENTATION) == [ + "segmentation" + ] + + with pytest.raises(IncompatibleException): + seg_loss.get_input_tensors(inputs, LabelType.KEYPOINTS) + with pytest.raises(IncompatibleException): + seg_loss.get_input_tensors(inputs, "keypoints") + + det_head = DummyDetectionHead() + det_loss = DummyLoss(node=det_head) + with pytest.raises(ValueError): + det_loss.get_input_tensors(inputs) + + +def test_prepare(inputs, labels): + backbone = DummyBackbone() + seg_head = DummySegmentationHead() + seg_loss = DummyLoss(node=seg_head) + det_head = DummyDetectionHead() + + assert seg_loss.prepare(inputs, labels) == 
("segmentation", "segmentation") + inputs["segmentation"].append("segmentation2") + assert seg_loss.prepare(inputs, labels) == ( + "segmentation2", + "segmentation", + ) + + with pytest.raises(RuntimeError): + NoLabelLoss(node=backbone).prepare(inputs, labels) + + with pytest.raises(RuntimeError): + NoLabelLoss(node=seg_head).prepare(inputs, labels) + + with pytest.raises(RuntimeError): + DummyLoss(node=det_head).prepare(inputs, labels) diff --git a/tests/unittests/test_base_node.py b/tests/unittests/test_base_node.py new file mode 100644 index 00000000..68386f73 --- /dev/null +++ b/tests/unittests/test_base_node.py @@ -0,0 +1,160 @@ +import pytest +import torch +from luxonis_ml.data import LabelType +from torch import Size, Tensor + +from luxonis_train.nodes import AttachIndexType, BaseNode +from luxonis_train.utils import DatasetMetadata, Packet +from luxonis_train.utils.exceptions import IncompatibleException + + +class DummyNode(BaseNode, register=False): + def forward(self, _): ... + + +@pytest.fixture +def packet() -> Packet[Tensor]: + return { + "features": [torch.rand(3, 224, 224)], + } + + +@pytest.mark.parametrize( + ("attach_index", "expected"), + [ + (-1, 5), + (0, 1), + ("all", [1, 2, 3, 4, 5]), + ((0, 2), [1, 2]), + ((0, 4, 2), [1, 3]), + ((-1, -3, -1), [5, 4]), + ((4, 2), [5, 4]), + ((-1, -3), [5, 4]), + ((-4, 4), [2, 3, 4]), + ((1, -1), [2, 3, 4]), + ], +) +def test_attach_index( + attach_index: AttachIndexType, expected: list[int] | int +): + lst = [1, 2, 3, 4, 5] + + class DummyBaseNode: + attach_index: AttachIndexType + + DummyBaseNode.attach_index = attach_index + + assert BaseNode.get_attached(DummyBaseNode, lst) == expected # type: ignore + + +def test_attach_index_error(): + lst = [1, 2, 3, 4, 5] + + class DummyNode(BaseNode, register=False): + attach_index: AttachIndexType + + with pytest.raises(ValueError): + DummyNode.attach_index = 10 + BaseNode.get_attached(DummyNode, lst) # type: ignore + + with pytest.raises(ValueError): + DummyNode.attach_index = "none" # type: ignore + BaseNode.get_attached(DummyNode, lst) # type: ignore + + +def test_invalid(packet: Packet[Tensor]): + node = DummyNode() + with pytest.raises(RuntimeError): + _ = node.input_shapes + with pytest.raises(RuntimeError): + _ = node.original_in_shape + with pytest.raises(RuntimeError): + _ = node.dataset_metadata + with pytest.raises(ValueError): + node.unwrap([packet, packet]) + with pytest.raises(ValueError): + node.wrap({"inp": torch.rand(3, 224, 224)}) + + +def tets_in_sizes(): + node = DummyNode( + input_shapes=[{"features": [Size((3, 224, 224)) for _ in range(3)]}] + ) + assert node.in_sizes == [Size((3, 224, 224)) for _ in range(3)] + node = DummyNode(in_sizes=Size((3, 224, 224))) + assert node.in_sizes == Size((3, 224, 224)) + with pytest.raises(RuntimeError): + node = DummyNode(input_shapes=[{"feats": [Size((3, 224, 224))]}]) + _ = node.in_sizes + + +def test_check_type_override(): + class DummyNode(BaseNode, register=False): + in_channels: int + + def forward(self, _): ... 
+ + with pytest.raises(IncompatibleException): + DummyNode( + input_shapes=[ + {"features": [Size((3, 224, 224)) for _ in range(3)]} + ] + ) + + +def test_tasks(): + class DummyHead(DummyNode): + tasks = [LabelType.CLASSIFICATION] + + class DummyMultiHead(DummyNode): + tasks = [LabelType.CLASSIFICATION, LabelType.SEGMENTATION] + + dummy_head = DummyHead() + dummy_node = DummyNode() + dummy_multi_head = DummyMultiHead(n_keypoints=4) + assert ( + dummy_head.get_task_name(LabelType.CLASSIFICATION) == "classification" + ) + assert dummy_head.task == "classification" + with pytest.raises(ValueError): + dummy_head.get_task_name(LabelType.SEGMENTATION) + + with pytest.raises(RuntimeError): + dummy_node.get_task_name(LabelType.SEGMENTATION) + + with pytest.raises(RuntimeError): + _ = dummy_node.task + + with pytest.raises(ValueError): + _ = dummy_multi_head.task + + metadata = DatasetMetadata( + classes={ + "segmentation": ["car", "person", "dog"], + "classification": ["car-class", "person-class"], + }, + n_keypoints={"color-segmentation": 0, "detection": 0}, + ) + + dummy_multi_head._dataset_metadata = metadata + assert dummy_multi_head.get_class_names(LabelType.SEGMENTATION) == [ + "car", + "person", + "dog", + ] + assert dummy_multi_head.get_class_names(LabelType.CLASSIFICATION) == [ + "car-class", + "person-class", + ] + assert dummy_multi_head.get_n_classes(LabelType.SEGMENTATION) == 3 + assert dummy_multi_head.get_n_classes(LabelType.CLASSIFICATION) == 2 + assert dummy_multi_head.n_keypoints == 4 + with pytest.raises(ValueError): + _ = dummy_head.n_keypoints + with pytest.raises(RuntimeError): + _ = dummy_node.n_keypoints + + dummy_head = DummyHead(n_classes=5) + assert dummy_head.n_classes == 5 + with pytest.raises(ValueError): + _ = dummy_multi_head.n_classes diff --git a/tests/unittests/test_blocks.py b/tests/unittests/test_blocks.py new file mode 100644 index 00000000..8b6110d4 --- /dev/null +++ b/tests/unittests/test_blocks.py @@ -0,0 +1,15 @@ +import torch + +from luxonis_train.nodes.blocks import SqueezeExciteBlock, autopad + + +def test_autopad(): + assert autopad(1, 2) == 2 + assert autopad(2) == 1 + assert autopad((2, 4)) == (1, 2) + + +def test_squeeze_excite_block(): + se_block = SqueezeExciteBlock(64, 32) + x = torch.rand(1, 64, 224, 224) + assert se_block(x).shape == (1, 64, 224, 224) diff --git a/tests/unittests/test_callbacks/__init__.py b/tests/unittests/test_callbacks/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/unittests/test_callbacks/test_needs_checkpoint.py b/tests/unittests/test_callbacks/test_needs_checkpoint.py new file mode 100644 index 00000000..bd296dea --- /dev/null +++ b/tests/unittests/test_callbacks/test_needs_checkpoint.py @@ -0,0 +1,6 @@ +from luxonis_train.callbacks.needs_checkpoint import NeedsCheckpoint + + +def test_other_type(): + assert NeedsCheckpoint._get_other_type("loss") == "metric" + assert NeedsCheckpoint._get_other_type("metric") == "loss" diff --git a/tests/unittests/test_loaders/__init__.py b/tests/unittests/test_loaders/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/unittests/test_loaders/test_base_loader.py b/tests/unittests/test_loaders/test_base_loader.py new file mode 100644 index 00000000..dee1ecef --- /dev/null +++ b/tests/unittests/test_loaders/test_base_loader.py @@ -0,0 +1,94 @@ +import pytest +import torch +from luxonis_ml.data import LabelType +from torch import Size + +from luxonis_train.loaders import collate_fn + + +@pytest.mark.parametrize( + 
"input_names_and_shapes", + [ + [("features", Size([3, 224, 224]))], + [ + ("features", Size([3, 224, 224])), + ("segmentation", Size([1, 224, 224])), + ], + [ + ("features", Size([3, 224, 224])), + ("segmentation", Size([1, 224, 224])), + ("disparity", Size([1, 224, 224])), + ], + [ + ("features", Size([3, 224, 224])), + ("pointcloud", Size([1000, 3])), + ], + [ + ("features", Size([3, 224, 224])), + ("pointcloud", Size([1000, 3])), + ("foobar", Size([2, 3, 4, 5, 6])), + ], + ], +) +@pytest.mark.parametrize("batch_size", [1, 2]) +def test_collate_fn( + input_names_and_shapes: list[tuple[str, Size]], batch_size: int, subtests +): + def build_batch_element(): + inputs = {} + for name, shape in input_names_and_shapes: + inputs[name] = torch.rand(shape, dtype=torch.float32) + + labels = { + "classification": ( + torch.randint(0, 2, (2,), dtype=torch.int64), + LabelType.CLASSIFICATION, + ), + "segmentation": ( + torch.randint(0, 2, (1, 224, 224), dtype=torch.int64), + LabelType.SEGMENTATION, + ), + "keypoints": ( + torch.rand(1, 52, dtype=torch.float32), + LabelType.KEYPOINTS, + ), + "boundingbox": ( + torch.rand(1, 5, dtype=torch.float32), + LabelType.BOUNDINGBOX, + ), + } + + return inputs, labels + + batch = [build_batch_element() for _ in range(batch_size)] + + inputs, annotations = collate_fn(batch) # type: ignore + + with subtests.test("inputs"): + assert inputs["features"].shape == (batch_size, 3, 224, 224) + assert inputs["features"].dtype == torch.float32 + + with subtests.test("classification"): + assert "classification" in annotations + assert annotations["classification"][0].shape == (batch_size, 2) + assert annotations["classification"][0].dtype == torch.int64 + + with subtests.test("segmentation"): + assert "segmentation" in annotations + assert annotations["segmentation"][0].shape == ( + batch_size, + 1, + 224, + 224, + ) + assert annotations["segmentation"][0].dtype == torch.int64 + + with subtests.test("keypoints"): + assert "keypoints" in annotations + assert annotations["keypoints"][0].shape == (batch_size, 53) + assert annotations["keypoints"][0].dtype == torch.float32 + + with subtests.test("boundingbox"): + assert "boundingbox" in annotations + assert annotations["boundingbox"][0].shape == (batch_size, 6) + assert annotations["boundingbox"][0].dtype == torch.float32 diff --git a/tests/unittests/test_losses/test_bce_with_logits_loss.py b/tests/unittests/test_losses/test_bce_with_logits_loss.py index 27871019..f94b5cb1 100644 --- a/tests/unittests/test_losses/test_bce_with_logits_loss.py +++ b/tests/unittests/test_losses/test_bce_with_logits_loss.py @@ -16,7 +16,9 @@ def test_forward_pass(): predictions = torch.full([bs, n_cl], 1.5) # logit loss_fn = BCEWithLogitsLoss() - loss = loss_fn.forward(predictions, targets) # -log(sigmoid(1.5)) = 0.2014 + loss = loss_fn.forward( + predictions, targets + ) # -log(sigmoid(1.5)) = 0.2014 assert isinstance(loss, torch.Tensor) assert loss.shape == torch.Size([]) @@ -57,5 +59,7 @@ def test_weights(): assert loss_weight != loss_no_weight -if __name__ == "__main__": - pytest.main() +def test_invalid(): + loss_fn = BCEWithLogitsLoss() + with pytest.raises(RuntimeError): + loss_fn.forward(torch.rand(10, 10), torch.rand(15, 15)) diff --git a/tests/unittests/test_metrics/test_torchmetrics.py b/tests/unittests/test_metrics/test_torchmetrics.py new file mode 100644 index 00000000..141a3785 --- /dev/null +++ b/tests/unittests/test_metrics/test_torchmetrics.py @@ -0,0 +1,52 @@ +import pytest +import torchmetrics +from luxonis_ml.data import 
LabelType + +from luxonis_train.attached_modules.metrics.torchmetrics import ( + TorchMetricWrapper, +) +from luxonis_train.nodes import BaseNode + + +def test_torchmetrics(): + class DummyNode(BaseNode): + tasks = [LabelType.CLASSIFICATION, LabelType.SEGMENTATION] + + def forward(self, _): ... + + class DummyMetric(TorchMetricWrapper): + supported_labels = [LabelType.CLASSIFICATION, LabelType.SEGMENTATION] + Metric = torchmetrics.Accuracy + + node_1_class = DummyNode(n_classes=1) + node_2_classes = DummyNode(n_classes=2) + node = DummyNode() + assert DummyMetric(node=node_1_class)._task == "binary" + assert DummyMetric(node=node_2_classes)._task == "multiclass" + assert DummyMetric(node=node_2_classes, task="multilabel") + assert DummyMetric(num_classes=1)._task == "binary" + assert DummyMetric(num_classes=2)._task == "multiclass" + assert DummyMetric(num_labels=2)._task == "multilabel" + + assert DummyMetric(task="binary") + + with pytest.raises(ValueError): + DummyMetric() + + with pytest.raises(ValueError): + DummyMetric(task="multiclass") + + with pytest.raises(ValueError): + DummyMetric(task="invalid") + + with pytest.raises(ValueError): + DummyMetric(task="binary", node=node_2_classes) + + with pytest.raises(ValueError): + DummyMetric(task="multiclass", node=node_1_class) + + with pytest.raises(ValueError): + DummyMetric(task="multiclass", node=node) + + with pytest.raises(ValueError): + DummyMetric(task="multilabel", node=node) diff --git a/tests/unittests/test_utils/test_assigners/test_tal_assigner.py b/tests/unittests/test_utils/test_assigners/test_tal_assigner.py deleted file mode 100644 index 8f291615..00000000 --- a/tests/unittests/test_utils/test_assigners/test_tal_assigner.py +++ /dev/null @@ -1,165 +0,0 @@ -import torch - -from luxonis_train.utils.assigners.tal_assigner import TaskAlignedAssigner - - -def test_init(): - assigner = TaskAlignedAssigner(n_classes=80, topk=13, alpha=1.0, beta=6.0, eps=1e-9) - assert assigner.n_classes == 80 - assert assigner.topk == 13 - assert assigner.alpha == 1.0 - assert assigner.beta == 6.0 - assert assigner.eps == 1e-9 - - -def test_forward(): - # Constants for clarity - batch_size = 10 - num_anchors = 100 - num_max_boxes = 5 - num_classes = 80 - - # Initialize the TaskAlignedAssigner - assigner = TaskAlignedAssigner(n_classes=num_classes, topk=13) - - # Create mock inputs - pred_scores = torch.rand(batch_size, num_anchors, 1) - pred_bboxes = torch.rand(batch_size, num_anchors, 4) - anchor_points = torch.rand(num_anchors, 2) - gt_labels = torch.rand(batch_size, num_max_boxes, 1) - gt_bboxes = torch.zeros(batch_size, num_max_boxes, 4) # no gt bboxes - mask_gt = torch.rand(batch_size, num_max_boxes, 1) - - # Call the forward method - labels, bboxes, scores, mask, assigned_gt_idx = assigner.forward( - pred_scores, pred_bboxes, anchor_points, gt_labels, gt_bboxes, mask_gt - ) - - # Assert the expected outcomes - assert labels.shape == (batch_size, num_anchors) - assert labels.unique().tolist() == [ - num_classes - ] # All labels should be num_classes as there are no GT boxes - assert bboxes.shape == (batch_size, num_anchors, 4) - assert torch.equal( - bboxes, torch.zeros_like(bboxes) - ) # All bboxes should be zero as there are no GT boxes - assert ( - scores.shape - == ( - batch_size, - num_anchors, - num_classes, - ) - ) # TODO: We have this in doc string: Returns: ... 
assigned scores of shape [bs, n_anchors, 1], - # it returns tensor of shape [bs, n_anchors, n_classes] instead - assert torch.equal( - scores, torch.zeros_like(scores) - ) # All scores should be zero as there are no GT boxes - assert mask.shape == (batch_size, num_anchors) - assert torch.equal( - mask, torch.zeros_like(mask) - ) # All mask values should be zero as there are no GT boxes - assert assigned_gt_idx.shape == (batch_size, num_anchors) - assert torch.equal( - assigned_gt_idx, torch.zeros_like(assigned_gt_idx) - ) # All assigned_gt_idx values should be zero as there are no GT boxes - - -def test_get_alignment_metric(): - # Create mock inputs - bs = 2 # batch size - n_anchors = 5 - n_max_boxes = 3 - n_classes = 80 - - pred_scores = torch.rand( - bs, n_anchors, n_classes - ) # TODO: Same issue: works with n_classes instead of 1, change it in the doc string in the method itself!!! - pred_bboxes = torch.rand(bs, n_anchors, 4) - gt_labels = torch.randint(0, n_classes, (bs, n_max_boxes, 1)) - gt_bboxes = torch.rand(bs, n_max_boxes, 4) - - # Initialize the TaskAlignedAssigner - assigner = TaskAlignedAssigner( - n_classes=n_classes, topk=13, alpha=1.0, beta=6.0, eps=1e-9 - ) - assigner.bs = pred_scores.size(0) - assigner.n_max_boxes = gt_bboxes.size(1) - - # Call the method - align_metric, overlaps = assigner._get_alignment_metric( - pred_scores, pred_bboxes, gt_labels, gt_bboxes - ) - - # Assert the expected outcomes - assert align_metric.shape == (bs, n_max_boxes, n_anchors) - assert overlaps.shape == (bs, n_max_boxes, n_anchors) - assert align_metric.dtype == torch.float32 - assert overlaps.dtype == torch.float32 - assert (align_metric >= 0).all() and ( - align_metric <= 1 - ).all() # Alignment metric should be in the range [0, 1] - assert (overlaps >= 0).all() and ( - overlaps <= 1 - ).all() # IoU should be in the range [0, 1] - - -def test_select_topk_candidates(): - # Constants for the test - batch_size = 2 - num_max_boxes = 3 - num_anchors = 5 - topk = 2 - - metrics = torch.rand(batch_size, num_max_boxes, num_anchors) - mask_gt = torch.rand(batch_size, num_max_boxes, 1) - - # Initialize the TaskAlignedAssigner - assigner = TaskAlignedAssigner(n_classes=80, topk=topk) - - # Call the method - is_in_topk = assigner._select_topk_candidates( - metrics, - ) - topk_mask = mask_gt.repeat([1, 1, topk]).bool() - assert torch.equal( - assigner._select_topk_candidates(metrics), - assigner._select_topk_candidates(metrics, topk_mask=topk_mask), - ) - # Assert the expected outcomes - assert is_in_topk.shape == (batch_size, num_max_boxes, num_anchors) - assert is_in_topk.dtype == torch.float32 - - # Check that each ground truth has at most 'topk' anchors selected - assert (is_in_topk.sum(dim=-1) <= topk).all() - - -def test_get_final_assignments(): - # Constants for the test - batch_size = 2 - num_max_boxes = 3 - num_anchors = 5 - num_classes = 80 - - # Mock inputs - gt_labels = torch.randint(0, num_classes, (batch_size, num_max_boxes, 1)) - gt_bboxes = torch.rand(batch_size, num_max_boxes, 4) - assigned_gt_idx = torch.randint(0, num_max_boxes, (batch_size, num_anchors)) - mask_pos_sum = torch.randint(0, 2, (batch_size, num_anchors)) - - # Initialize the TaskAlignedAssigner - assigner = TaskAlignedAssigner(n_classes=num_classes, topk=13) - assigner.bs = batch_size # Set batch size - assigner.n_max_boxes = gt_bboxes.size(1) - - # Call the method - assigned_labels, assigned_bboxes, assigned_scores = assigner._get_final_assignments( - gt_labels, gt_bboxes, assigned_gt_idx, mask_pos_sum - ) - - # 
Assert the expected outcomes - assert assigned_labels.shape == (batch_size, num_anchors) - assert assigned_bboxes.shape == (batch_size, num_anchors, 4) - assert assigned_scores.shape == (batch_size, num_anchors, num_classes) - assert (assigned_labels >= 0).all() and (assigned_labels <= num_classes).all() diff --git a/tests/unittests/test_utils/test_boxutils.py b/tests/unittests/test_utils/test_boxutils.py index 2cb3df24..2b05a428 100644 --- a/tests/unittests/test_utils/test_boxutils.py +++ b/tests/unittests/test_utils/test_boxutils.py @@ -1,39 +1,42 @@ +import pytest import torch -from luxonis_train.utils.boxutils import ( +from luxonis_train.utils.boundingbox import ( + IoUType, anchors_for_fpn_features, bbox2dist, bbox_iou, compute_iou_loss, dist2bbox, process_bbox_predictions, - process_keypoints_predictions, ) -def generate_random_bboxes(num_bboxes, max_width, max_height, format="xyxy"): - # Generate top-left corners (x1, y1) - x1y1 = torch.rand(num_bboxes, 2) * torch.tensor([max_width - 1, max_height - 1]) +def generate_random_bboxes( + n_bboxes: int, max_width: int, max_height: int, format: str = "xyxy" +): + x1y1 = torch.rand(n_bboxes, 2) * torch.tensor( + [max_width - 1, max_height - 1] + ) - # Generate widths and heights ensuring x2 > x1 and y2 > y1 wh = ( - torch.rand(num_bboxes, 2) * (torch.tensor([max_width, max_height]) - 1 - x1y1) + torch.rand(n_bboxes, 2) + * (torch.tensor([max_width, max_height]) - 1 - x1y1) + 1 ) if format == "xyxy": - # Calculate bottom-right corners (x2, y2) for xyxy format x2y2 = x1y1 + wh bboxes = torch.cat((x1y1, x2y2), dim=1) elif format == "xywh": - # Use x1y1 as top-left corner and wh as width and height for xywh format bboxes = torch.cat((x1y1, wh), dim=1) elif format == "cxcywh": - # Calculate center coordinates and use wh as width and height for cxcywh format cxcy = x1y1 + wh / 2 bboxes = torch.cat((cxcy, wh), dim=1) else: - raise ValueError("Unsupported format. Choose from 'xyxy', 'xywh', 'cxcywh'.") + raise ValueError( + "Unsupported format. Choose from 'xyxy', 'xywh', 'cxcywh'." 
+ ) return bboxes @@ -44,6 +47,8 @@ def test_dist2bbox(): bbox = dist2bbox(distance, anchor_points) assert bbox.shape == distance.shape + with pytest.raises(ValueError): + dist2bbox(distance, anchor_points, out_format="invalid") # type: ignore def test_bbox2dist(): @@ -56,22 +61,41 @@ def test_bbox2dist(): assert distance.shape == bbox.shape -def test_bbox_iou(): +@pytest.mark.parametrize("iou_type", ["none", "giou", "diou", "ciou", "siou"]) +def test_bbox_iou(iou_type: IoUType): for format in ["xyxy", "cxcywh", "xywh"]: bbox1 = generate_random_bboxes(5, 640, 640, format) - bbox2 = generate_random_bboxes(8, 640, 640, format) - - iou = bbox_iou(bbox1, bbox2) - - assert iou.shape == (5, 8) - assert iou.min() >= 0 and iou.max() <= 1 + if iou_type == "siou": + bbox2 = generate_random_bboxes(5, 640, 640, format) + else: + bbox2 = generate_random_bboxes(8, 640, 640, format) + + iou = bbox_iou( + bbox1, + bbox2, + bbox_format=format, # type: ignore + iou_type=iou_type, + ) + + assert iou.shape == (bbox1.shape[0], bbox2.shape[0]) + if iou_type == "none": + min = 0 + else: + min = -1.5 + assert iou.min() >= min and iou.max() <= 1 + + if iou_type == "none": + with pytest.raises(ValueError): + bbox_iou(bbox1, bbox2, iou_type="invalid") # type: ignore def test_compute_iou_loss(): pred_bboxes = generate_random_bboxes(8, 640, 640, "xyxy") target_bboxes = generate_random_bboxes(8, 640, 640, "xyxy") - loss_iou, iou = compute_iou_loss(pred_bboxes, target_bboxes, iou_type="giou") + loss_iou, iou = compute_iou_loss( + pred_bboxes, target_bboxes, iou_type="giou" + ) assert isinstance(loss_iou, torch.Tensor) assert isinstance(iou, torch.Tensor) @@ -93,21 +117,16 @@ def test_process_bbox_predictions(): assert out_bbox_tail.shape == (10, 4) -def test_process_keypoints_predictions(): - keypoints = torch.rand(10, 15) # 5 keypoints * 3 (x, y, visibility) - - x, y, visibility = process_keypoints_predictions(keypoints) - - assert x.shape == y.shape == visibility.shape == (10, 5) - - def test_anchors_for_fpn_features(): features = [torch.rand(1, 256, 14, 14), torch.rand(1, 256, 28, 28)] strides = torch.tensor([8, 16]) - anchors, anchor_points, n_anchors_list, stride_tensor = anchors_for_fpn_features( - features, strides - ) + ( + anchors, + anchor_points, + n_anchors_list, + stride_tensor, + ) = anchors_for_fpn_features(features, strides) assert isinstance(anchors, torch.Tensor) assert isinstance(anchor_points, torch.Tensor) diff --git a/tests/unittests/test_utils/test_dataset_metadata.py b/tests/unittests/test_utils/test_dataset_metadata.py new file mode 100644 index 00000000..8dba11a8 --- /dev/null +++ b/tests/unittests/test_utils/test_dataset_metadata.py @@ -0,0 +1,53 @@ +import pytest + +from luxonis_train.utils import DatasetMetadata + + +@pytest.fixture +def metadata(): + return DatasetMetadata( + classes={ + "color-segmentation": ["car", "person"], + "detection": ["car", "person"], + }, + n_keypoints={"color-segmentation": 0, "detection": 0}, + ) + + +def test_n_classes(metadata): + assert metadata.n_classes("color-segmentation") == 2 + assert metadata.n_classes("detection") == 2 + assert metadata.n_classes() == 2 + with pytest.raises(ValueError): + metadata.n_classes("segmentation") + metadata._classes["segmentation"] = ["car", "person", "tree"] + with pytest.raises(RuntimeError): + metadata.n_classes() + + +def test_n_keypoints(metadata): + assert metadata.n_keypoints("color-segmentation") == 0 + assert metadata.n_keypoints("detection") == 0 + assert metadata.n_keypoints() == 0 + with 
pytest.raises(ValueError): + metadata.n_keypoints("segmentation") + metadata._n_keypoints["segmentation"] = 1 + with pytest.raises(RuntimeError): + metadata.n_keypoints() + + +def test_class_names(metadata): + assert metadata.classes("color-segmentation") == ["car", "person"] + assert metadata.classes("detection") == ["car", "person"] + assert metadata.classes() == ["car", "person"] + with pytest.raises(ValueError): + metadata.classes("segmentation") + metadata._classes["segmentation"] = ["car", "person", "tree"] + with pytest.raises(RuntimeError): + metadata.classes() + + +def test_no_loader(): + metadata = DatasetMetadata() + with pytest.raises(RuntimeError): + metadata.autogenerate_anchors(3) diff --git a/tests/unittests/test_utils/test_general.py b/tests/unittests/test_utils/test_general.py new file mode 100644 index 00000000..7f13f796 --- /dev/null +++ b/tests/unittests/test_utils/test_general.py @@ -0,0 +1,44 @@ +import pytest + +from luxonis_train.utils.general import infer_upscale_factor + + +@pytest.mark.parametrize( + ("in_size", "orig_size", "expected"), + [ + ((1, 1), (1, 1), 0), + ((1, 1), (2, 2), 1), + ((2, 2), (1, 1), -1), + ((2, 2), (4, 4), 1), + ((4, 4), (2, 2), -1), + ((4, 4), (8, 8), 1), + ((8, 8), (4, 4), -1), + ((2, 2), (16, 16), 3), + ((16, 16), (4, 4), -2), + (4, 8, 1), + ], +) +def test_infer_upscale_factor( + in_size: tuple[int, int] | int, + orig_size: tuple[int, int] | int, + expected: int, +): + assert infer_upscale_factor(in_size, orig_size) == expected + + +@pytest.mark.parametrize( + ("in_size", "orig_size"), + [ + ((1, 1), (2, 1)), + ((1, 1), (1, 2)), + ((2, 3), (16, 16)), + ((3, 2), (16, 16)), + ((3, 3), (16, 16)), + ], +) +def test_infer_upscale_factor_fail( + in_size: tuple[int, int] | int, + orig_size: tuple[int, int] | int, +): + with pytest.raises(ValueError): + infer_upscale_factor(in_size, orig_size) diff --git a/tests/unittests/test_utils/test_graph.py b/tests/unittests/test_utils/test_graph.py new file mode 100644 index 00000000..c63e4b72 --- /dev/null +++ b/tests/unittests/test_utils/test_graph.py @@ -0,0 +1,79 @@ +import pytest + +from luxonis_train.utils.graph import Graph, is_acyclic, traverse_graph + + +@pytest.mark.parametrize( + ("graph", "acyclic"), + [ + ({}, True), + ({"a": []}, True), + ({"a": ["b"], "b": ["a"]}, False), + ({"a": ["b"], "b": []}, True), + ({"a": ["b"], "b": ["c"], "c": ["a"]}, False), + ({"a": ["b"], "b": ["c"], "c": []}, True), + ({"a": ["b", "c"], "b": ["d"], "c": ["d"], "d": []}, True), + ({"a": ["b", "c"], "b": ["d"], "c": ["d"], "d": ["a"]}, False), + ], +) +def test_acyclic(graph: Graph, acyclic: bool): + assert is_acyclic(graph) == acyclic + + +@pytest.mark.parametrize( + ("graph", "nodes", "expected"), + [ + ({}, {}, []), + ( + {"a": []}, + {"a": 1}, + [("a", 1, [], [])], + ), + ( + {"a": ["b"], "b": []}, + {"a": 1, "b": 2}, + [("b", 2, [], ["a"]), ("a", 1, ["b"], [])], + ), + ( + {"a": ["b"], "b": ["c"], "c": []}, + {"a": 1, "b": 2, "c": 3}, + [ + ("c", 3, [], ["a", "b"]), + ("b", 2, ["c"], ["a"]), + ("a", 1, ["b"], []), + ], + ), + ( + {"a": ["b", "c"], "b": ["d"], "c": ["d"], "d": []}, + {"a": 1, "b": 2, "c": 3, "d": 4}, + [ + ("d", 4, [], ["a", "b", "c"]), + ("b", 2, ["d"], ["a", "c"]), + ("c", 3, ["d"], ["a"]), + ("a", 1, ["b", "c"], []), + ], + ), + ], +) +def test_traverse( + graph: Graph, + nodes: dict[str, int], + expected: list[tuple[str, int, list[str], list[str]]], +): + result = list(traverse_graph(graph, nodes)) + assert result == expected + + +@pytest.mark.parametrize( + ("graph", "nodes"), + [ + 
({"a": ["b"], "b": ["a"]}, {"a": 1, "b": 2}), + ( + {"a": ["b", "c"], "b": ["d"], "c": ["d"], "d": ["a"]}, + {"a": 1, "b": 2, "c": 3, "d": 4}, + ), + ], +) +def test_traverse_fail(graph: Graph, nodes: dict[str, int]): + with pytest.raises(RuntimeError): + list(traverse_graph(graph, nodes)) diff --git a/tests/unittests/test_utils/test_keypoints.py b/tests/unittests/test_utils/test_keypoints.py new file mode 100644 index 00000000..3d20dae6 --- /dev/null +++ b/tests/unittests/test_utils/test_keypoints.py @@ -0,0 +1,24 @@ +import pytest +import torch + +from luxonis_train.utils.keypoints import ( + get_sigmas, + process_keypoints_predictions, +) + + +def test_get_sigmas(): + sigmas = [0.1, 0.2, 0.3] + pytest.approx(get_sigmas(sigmas, 3).tolist(), sigmas) + with pytest.raises(ValueError): + get_sigmas(sigmas, 2) + assert len(get_sigmas(None, 17)) == 17 + assert len(get_sigmas(None, 5)) == 5 + + +def test_process_keypoints_predictions(): + keypoints = torch.tensor([[0.1, 0.2, 1.0, 0.4, 0.5, 0.0]]) + x, y, visibility = process_keypoints_predictions(keypoints) + pytest.approx(x[0].tolist(), [0.1, 0.4]) + pytest.approx(y[0].tolist(), [0.2, 0.5]) + pytest.approx(visibility[0].tolist(), [1.0, 0.0]) diff --git a/tests/unittests/test_utils/test_loaders/test_base_loader.py b/tests/unittests/test_utils/test_loaders/test_base_loader.py deleted file mode 100644 index 0209c192..00000000 --- a/tests/unittests/test_utils/test_loaders/test_base_loader.py +++ /dev/null @@ -1,69 +0,0 @@ -import pytest -import torch - -from luxonis_train.utils.loaders import collate_fn -from luxonis_train.utils.types import LabelType - - -@pytest.mark.parametrize( - "input_names_and_shapes", - [ - [("features", torch.Size([3, 224, 224]))], - [ - ("features", torch.Size([3, 224, 224])), - ("segmentation", torch.Size([1, 224, 224])), - ], - [ - ("features", torch.Size([3, 224, 224])), - ("segmentation", torch.Size([1, 224, 224])), - ("disparity", torch.Size([1, 224, 224])), - ], - [ - ("features", torch.Size([3, 224, 224])), - ("pointcloud", torch.Size([1000, 3])), - ], - [ - ("features", torch.Size([3, 224, 224])), - ("pointcloud", torch.Size([1000, 3])), - ("foobar", torch.Size([2, 3, 4, 5, 6])), - ], - ], -) -@pytest.mark.parametrize("batch_size", [1, 2]) -def test_collate_fn(input_names_and_shapes, batch_size): - # Mock batch data - - def build_batch_element(): - inputs = {} - for name, shape in input_names_and_shapes: - inputs[name] = torch.rand(shape, dtype=torch.float32) - - labels = { - "classification": ( - torch.randint(0, 2, (2,), dtype=torch.int64), - LabelType.CLASSIFICATION, - ) - } - - return inputs, labels - - batch = [build_batch_element() for _ in range(batch_size)] - - # Call collate_fn - inputs, annotations = collate_fn(batch) # type: ignore - - # Check images tensor - assert inputs["features"].shape == (batch_size, 3, 224, 224) - assert inputs["features"].dtype == torch.float32 - - # Check annotations - assert "classification" in annotations - assert annotations["classification"][0].shape == (batch_size, 2) - assert annotations["classification"][0].dtype == torch.int64 - - -# TODO: test also segmentation, boundingbox and keypoint - - -if __name__ == "__main__": - pytest.main()