diff --git a/.github/workflows/cli_vamb.yml b/.github/workflows/cli_vamb.yml index 1f74ae2a..9018429d 100644 --- a/.github/workflows/cli_vamb.yml +++ b/.github/workflows/cli_vamb.yml @@ -28,7 +28,7 @@ jobs: cache-dependency-path: '**/pyproject.toml' - name: Download fixtures run: | - wget https://www.dropbox.com/scl/fi/xzc0tro7oe6tfm3igygpj/ci_data.zip\?rlkey\=xuv6b5eoynfryp4fba1kfp5jm\&st\=9oz24ych\&dl\=0 -O ci_data.zip + wget https://www.dropbox.com/scl/fi/xzc0tro7oe6tfm3igygpj/ci_data.zip\?rlkey\=xuv6b5eoynfryp4fba1kfp5jm\&st\=rjb1xccw\&dl\=0 -O ci_data.zip unzip -o ci_data.zip - name: Install dependencies run: | @@ -61,6 +61,6 @@ jobs: cat outdir_taxometer/log.txt - name: Run k-means reclustering run: | - vamb recluster --outdir outdir_recluster --fasta catalogue_mock.fna.gz --abundance abundance_mock.npz --latent_path outdir_taxvamb/vaevae_latent.npz --clusters_path outdir_taxvamb/vaevae_clusters_split.tsv --hmmout_path markers_mock.hmmout --algorithm kmeans --minfasta 200000 + vamb recluster --outdir outdir_recluster --fasta catalogue_mock.fna.gz --abundance abundance_mock.npz --latent_path outdir_taxvamb/vaevae_latent.npz --clusters_path outdir_taxvamb/vaevae_clusters_split.tsv --markers markers_mock.npz --algorithm kmeans --minfasta 200000 ls -la outdir_recluster cat outdir_recluster/log.txt diff --git a/pyproject.toml b/pyproject.toml index 65452776..82983def 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,7 +8,6 @@ dependencies = [ "pycoverm == 0.6.0", "networkx == 3.2", # 3.3 drops Python 3.9 support "scikit-learn == 1.5.0", - "pandas == 2.2.0", "dadaptation == 3.2", "loguru == 0.7.2", "pyhmmer == 0.10.12", diff --git a/test/data/marker.fna b/test/data/marker.fna new file mode 100644 index 00000000..e929c753 --- /dev/null +++ b/test/data/marker.fna @@ -0,0 +1,2 @@ +>abc 
+GTGGTGAGTGCTTGGACGTGTGTGGTGATGCTCATGGTGTCTTCCCCCCTTCCCCGTGTTGTTACCACCCACTCTACCCGCTCACACTTTCGAATACAAGGGTATTTTTCGAAATGGAAAAGCTGTCGAGCACTCACATGTTCGACAGCTTCATATGTCCTTGAACTACGTAACTTTTTCCTTACAATCAGCGCAAAGTCCAAAGATTTCGGCCTCGTGACTACTGAGAGCAAAGCCGTTTTTAGTGGCAATTTCCTGTGCCCATGTCTCTACTGGACCGCCATCGATTTCGACTGTGTGACCGCAATTGGTGCAGACCAAGTGATGGTGGTGTCCCTCGTCGTGGCATTGGCGGTACAGAGTTTCTCCACCTGTGACGGTAAGTACGTCGACTGCCCTGATGTCCGCGAGGGATTGGAGGGTTCGGTAGACGGTTGTGAGGCCGACGTTGTGTTCCCTGGTGGATAGCTCGTGATGGATTTCTTTGGCGGAAGCGAAGTTATCGATTTCCTCAAGAACGTCAATTACGGCTTTTCGCTGTCTGGTGCTTCGCACTCCCAGCTTCGGGGCAGAGCCTTGGCTGATGCGATTGATACCCACCGTTGATCCTCCTCAATGACACAAAATGTACTTCGATAGTCTACCCAGATGTGTCAACCCCTGCGTTTAGTGCCAGGAGAAGTATGTCGAGGACGAGTGGTTCGGCAAGTGAATAAGTCATTTGCCGGCCTTGACGTTCTGCGTCGACGATACCTGCAGTTTTAAGGACTTTGAGGTGTTGGCTCACTAGTGGTTGCGAACTTTTTACTAGCTTGACCAATTCGTGGACGTAGTGGGGCCTTTCGTTGAGGGCGAGGATGATTTCGATTCTTAAGGGGGAATCTAGTGCCCTAATCAGCAGGCTGATCGCTTTGATGTTTTTTGCTGTTGCAAGTTTCTGAAGCTCAGCTGATGCTGTGGATTCGGACTCTTCTGCAGGGGTGACGAAATTCCGATTTGAGTGTTGAGCCACGGGGAAGTCCTTCCGTCCTTAGGCTAGGTCTGGAATGGATCTAGCACGCTTGCTATTTTACCTTCTATATAAACCTTTTATGAGGGAAATGAAAAAATAGTTATTAGAACTAGTTTACATCGCGAAGGCCGCAAAATGACGGGGTCAGCGGAAGCAACATCGTTAGTTGGGCTAGGATTGGTTGGGTATGTCCTAAAAGGGACGGTTATTTTTTCATTCGACGTGGAGGAGAGCATCCGACGTGGCTCAGCAATCGATCATCGACACCGTGGTTAACCTGTGTAAACGACGTGGACTGGTGTACCCCTGTGGTGAGATCTACGGCGGTACCCGCTCTGCGTGGGACTACGGCCCGCTGGGTGTGGAGCTGAAGGAAAACATCAAGCGCCAGTGGTGGCGTTCTATGGTTACTTCCCGCCCAGATGTTGTGGGTGTTGATACTTCTGTCATCCTTCCTCGCCAGGTGTGGGTAACTTCCGGCCACGTTGAGGTCTTCACTGACCCACTGGTTGAGTCTTTGAACACCCACAAGCGTTACCGTGCGGACCACCTGCTGGAGCAGTACGAAGAGAAGCATGGTCACCCACCTGTAAACGGCTTGGCTGACATCAACGATCCAGAGACCGGCCAGCCAGGTAACTGGACTGAGCCTAAGGCGTTCTCTGGTCTTCTGAAGACTTTCTTGGGACCTGTGGACGACGAAGAGGGTCTGCACTACCTGCGCCCTGAAACTGCTCAGGGTATCTTCGTGAACTTCAAGAACGTGATGAACACTTCACGCATGAAGCCACCTTTCGGTATCGCGAACATCGGTAAGTCTTTCCGTAACGAGATCACCCCAGGTAACTTCATTTTCCGTACTCGTGAGTTCGAGCAGATGGAGATGGAGTTCTTCGTCAAGCCTGGTGAGGACGAAGAGTGGCACCAGCACTGGATTGATACTCGCCTGCAGTGGTACATCAACCTGGGCATTAAGCCTGAGAACCTGCGTCTGTACGAGCACCCTCAGGAGAAGCTGTCTCACTACTCCAAGCGCACTGTTGATATTGAGTACGCATTCAACTTTGCTAACACCAAGTGGGGCGAGTTAGAGGGTATCGCGAACCGTACTGATTACGATCTTCGCGTGCACTCTGAGGGCTCTGGTGAGGACCTGTCATTCTTCGATCAGGAGACCAATGAGCGTTGGATTCCTTTCGTAATCGAGCCTGCTGCAGGTCTTGGTCGCGCAATGATGATGTTCCTGATGGATGCTTATCACGAGGACGAGGCACCAAACTCAAAGGGTGGCGTCGATAAGCGTGTTGTTCTGAAGCTTGACCGTCGCCTTGCGCCGGTTAAGGTTGCGGTCTTGCCGCTGTCAAAGAAGGACACTTTGACGCCTTTGGCGGAAAAGCTCGCAGCAGAGCTGCGTGAATTCTGGAACGTTGATTACGACACTTCAGGTGCGATTGGTCGCCGTTACCGTCGTCAGGACGAGATCGGTACTCCATTCTGCGTCACCGTTGACTTTGATTCTCTCGAGGACAACGCTGTGACCGTGCGTGAGCGCGACACCATGGAGCAGGTTCGTGTTCCACTTGATGAGCTGCAGGGTTACTTGGCTCAGCGCCTCATCGGCTGCTAAACGGCAACCAATAGAGCGATAATTCGCTAAGACGAATGTAATCGCAGCAACATATAGCACCGGCTTAACAGGCCGGTGCTATTCTGTTCGCATGACTTCGAAGGATCTGATTGTGACCTCCTATACGTCTTGGGGCAAGCGTTTCAAGAATGACGGGAAGCTTTTTATTAACCTACTTCGCAGCACCACTGATAGTGCTGATGAAAAGGTTTTAGCCACTTTCGGTGAAGTTCCCAGCAAATCATTTGAAACCACCGCAACGGTTGATGAGCAGCAGTGGGAACTGTCCTTCAATATTGATGGAACGGCAACTGCCAAGCTTCCTGATGGTCGTGTGTTCAGCGCGAATGCAGGTGAGAAGACCTTTACCAAGTCCAAGCGGATTGAAATCGACATGGACGGCACCGCGATGGCTGCTGTTAATGAAGATAAAAACAATTGGATTATCGACGATTCTGAAGAGAATAAAGTCGCTCAGTTTACCGGTATGAACAACGGTGTGCGTCGCGCGATTGTGGAGTTTGAGCCTGACGTAGAAGTCACCCAGGAGCAGGAAATTTTCTTGTCGTGGGTTGCTCGGAAAACTCTGGAATCCCGCATGTTGGGCTCCAGTTGGGGACTGACTCTGTTTTTGATCATTTTGACGCCAATCATTATTTTTCTCACTTTCAGCTAAAAGGACCATGCAATGGTAGACGCTCAGCGCCCCAAAGCAGGCATCTTCGGTAGCCACACAGAAGAAACATGGGTGTGGCTCGGTAATGAACTTTTCGACGAGTCCGGCGAGGTCATCGCCGACGTTCGCTCCGACGTCCTCTACGTGGATCGCGAACGACTACTCATCGAATCCACCCCCGGCACCATGCGTTTTCGTTGCCGCGCAACACTGTCCGGGGGTGAGGTCTATACAATGAC
TCAGAATTCTTTCACTGTGGGGGATCTCACTGCGGTGTGCGGGCGCCGGACGTATTCGCTAAAAAGGGTGTCGCCGTGGCGTAAAGAACGCCTGATCACCAACAATGGGGTGGAAGTGGCGCGACTTCGCCCGATGACCAGCGGTAAAGTCGAATTCATTGTGGGCACCGCGGACAGCGAGGCGTTGCCGTTCGTCGACGCAGTATTTTTGAGCTGGGCGTGCGTCCTGGTGGATTCGGCCGTGCGCCGGCCGAAAATTTAAAAGCTTTTTGCTTATCGACGCACCCCTCCACCTGTTTTTTGTAGCCGGGGGATCATTTCCTTTGAAGGATCCAATCTCCGCACTTAGTTTCCTTCGGTGTGAAGGAAAGAGTTCCGTAAAGACCTCTATCTCATTTAAAGAAGTGGAGGATTAGGGTCGTTGACTCGCCTTCGGCACTAATTTGAGCCAAGTTCAAGTTTGCTGCCATCCCAGGTGACCGAAAATGTCCTATGCGAGGTCTCTTCGGTCACTTGGTTTTGCTCGTTTCAGGCTAGAAGCGGCCTCCGCGGAACCCTCCTCCGCCACCGCCACCACCGCTGAAGCCGCCACCGCCTCCACCGAAGCCTCCACCGAAACCGCCACCGCGGCCGCTGTTGAGAATCGAGTTGATCACCATGCCGGTGACAATCGCACCGGTGGTTCCGCCACCGGAATTGTGGCGATTGTTGTAGGTGGTGATGTCGTTTTGTGCTGACTTGCTGGCGCGTTGGGCTGCGACTGCTGCTTGACGTCCGTAATCAATTCCTGCACGGGTGTCGCGGGTGCGGTTTTGTTGTGCCATGGCGTACAGTTTTTGTGCGTTGGCCAGGTGGGTGCGGGCTTCGGATTTTACGATGCGACCGCGGGTGGAGATGAGGTCTTCGGCCTTTTGGATTTGGCTTCTTGCAGATTGCAGCTGTTGGTCGAATACGCGTAGCTGGCGGGCTTGATCAGCTGCGGTGGCGCGAAGTGTGTCAAGTTGAGTGTCGAGGGCGGAGTCGACATCGACAAGTTCTGTGTAGGTTCCGAGCGGATCCTTTTCGGCGTCTGCTGATGCGGTGGTTAGTGCTGCGCTGGCTGCGCGGACAGCATCGTCGAGGGAGGCCCAGTCGGCACGGGCACCGTCGGCTCCTGCGCTTTGTTTGAGTTGGCCGGCTTCGTTGATTTCGTCTGAGATTTCTTGAATCAGATCGGCAACGTTTGCTTTGGCTGTGGAGATGTTTTCATCGGCATGCTCGACGCCCTCGAGGAGTTTGTCTGCGGTAGTGATGGCGTGCTCGATGTGACGGATCGCGTCGATAAGCCCGCCCTGCTCGCCTGCGGGCATGGACTCTATCTTGTACGCCTGTGGCAGGACTTCTTCTGCTTCGTCGAGCGAAGCGCTGGCGAGGTCGACGTTGTCGTCGATGCTTTCAAGGACCTCTGCTGAGTAGCGAGCGCGCAGGCCAGCGAGTGTTTCTTGAGCCTTGGGGAGGCGGGTGCGCAGGTCGACGGATTTTTGGGTGAGAGCATCCAATTTGCTGCCCGCGTTGATCAGCAGGTTGCGCATATCGGCAAAGTTTTGGGCCTCGGCGTCGAGGGCATCGTCGGCTTGGCCACAGGATGAAATGATTTCTACCAGCATGGATCGACGTTCGGCTTCGGATTCTGGGATAGAATCGTTGAGGCGCTGCTGAATCTCAAAGGCTTTTTGCAGGGTGCCGGTGGAGTGGTTCATGGCGCGGTTGAAGCTGCGGGTGCGCTCTGGTCCGAACTCGGAGGTAGCGATAGCGAGCTCTTCTTTTCCGCGACGGATGGAGTCATCAGTGGAGGTGAGCTCTTCTTGGGCAAGGTGTTCGAGAGTTTCCATGGGAAGCTGCATGAGGCGGTTGGTATCGCGAGGGTCGATCTCACGTGCATCTTCCAAGGTTGCAGCACTTGTTTTCTTCTTGCGGCTGCGGGAATAGGCCCAAATTCCGCCACCAGCGGCCACTGTGCCAACGCCCGCAGCAGCCAACCAAGCGCCGGAAGATCCAGAAGAGCCTGAGGTTCCTGAGGCACCAGAACTAGAACCAACTGATTCTGCCAGCGCTAGTGCGGAGCCTGCCCAATCTTCTTGGGAAAGCGCCTGGAAAGCAGCGTTGTTGGCGGCGTCGAGTTCAGCGTCGGTCCATTGAGTACCACCTTGGATGCCGTACTGCCGTTCCTCGGGAGCGAGTGCATAAACCAAGACGTTTCCGCCGCCGTTGGCTTGGAGTGCTTGCTGCGTCCACGTTTCAGGGTCAACTCCGTCGAAAGAGCTTAGGAAAACAACGAAAATAACCTTTTGTTCAGATGCCTTTACATCATCGATGGCAGCCTGAATGTTGGTGATATCGGACGAGGAAATCTGGCCGGTGTAGTCAGTGACATTGTCTTGGTAAAATTCTGGTGATTCAGCCAAGACATATGTTTCTGTGGCTTCTGCAGTGTGAGCAGTAAAAAATGGTCCACTGATAAGGAGCGCGCCAGCTCCAATTGCCACAGTGACCGATACACGGCGGACGTTTTCCCGAAGATGCACCAAACTAAAGTTCATGGTCCCCACCTTAGACGAGTCCAGCTGGCACACTAGTTAACGTGAGAAGATTTTTAGCCAAGAGTTTACTCTTAACCGCAGTAGCGCAACCAGCCCTGAGGGTGGTCGCGTATTCGATGCTCAGAACGCCTAATAATCGGCACAAAATTGATTCAATTTTGGTGTTGGGCACAGCTCAATATGATGGGGTTCCATCGAGGCAGTTTGCTGCTCGTTTGAGGCATGCCGCGAAGCTGTGGCGTCTTCATGAAATCCAGCATGTATATACTGTCGGCGGAAAACTTCCTGGTGATCGTTTCACCGAAGCAGAAGTCGCGCGGGAGTATTTGATCAAAGAGGGCGTGGATCCGGATCTGATTTTTGTCTCTGCAGTTGGCAATGACACTGTCTCCTCCTATGAGGCGCTTGATCCGGAAAAGCTTGGTCGGGTGCTGATTGTTACTGATCCGAACCATTCGTATCGGGCGGTGCGCATCGCGCGACGCATGGGCTTTGACGCGAAACCTTCCCCGACAACCTATAGTCCCGCGAAGTTTCCGTCGATAGTTTATTTTCTGACCTTGTCCCATGAGTGGGGCGGGGTAGTGGTACAGGACGTGTCGTGGCTCTTGGGCGAACGGGTGGCCGATAAGGTGGAAGCATCTTTGCGAACTATCCAAGGCCTGCTGCGCCCTTCGAGGCGTGCGCGCCATGAGCAACTTCGGAGGCTGAAAAAGTAGATGTACCCCTATTCCGACGCAGACGCTTTTCGACGCCACCCTGAGCGCGCCAAGTCCAGCCAACTGCGCACCAGCGCCGTAGACACCCGCAGCGCGTTCGCCCGCGACCGGGCTCGCGTGCTGCATTCTGCTGCTCTTCGACGCCTCGCGGATAAAACCCAAGTGGTTGGCCCCAATGATGGTGATACTCCGCGCACCCGGCTGACGCACTCTTTGGAAGTAGCTCAAATTGCACGGGGAATCGGAGCTGGACTGGA
TTTGGATCCTGATCTGTGCGATCTGGCAGGGCTGTGCCATGACATTGGGCATCCGCCGTATGGACACAACGGTGAAAACGCGTTGAATGAAGTTGCTGCGGCCTGTGGAGGATTTGAGGGCAACGCCCAAACCTTGCGTATTCTCACGCGGCTGGAGCCAAAAATTGTCTCTGATGAGGGGGAGAGCTTTGGGCTGAACTTGTCGCGGGCTGCTCTTGATGCTGCATGTAAGTATCCGTGGGCTAAAACAAATGCGGATGGCAGTGTCAATAAGAAATACAGTGCTTATGATGAGGACGCAGAAATTCTTGCTTGGATCAGGCAAGGCCATGAAGACCTCAGACCACCAATCGAAGCTCAGGTCATGGACTTTTCCGATGATATTGCCTACTCAGTACACGATGTAGAAGACGGCATTGTTTCCGGTCGCATCGATTTGAAAGTGCTGTGGGACCTGGTGGAATTAGCAGCACTGGCGGACAAAGGCGCAGCAGCTTTCGGAGGCTCGCCTGCAGAACTCATCGAGGGCGCAGCCTCGTTGCGGGAGCTTCCTGTGGTAGCGGCCGCTGCAGATTTTGATTTCTCACTGCGTTCCTACGCTGCGCTGAAGGCCATGACCTCAGAACTAGTGGGAAGATACGTTGGCTCTACCATCGAGTCAACAAAGAAAACACACGCCGGCATTGATGTGGGACGCATGCACGGCGATTTGATCATTCCAGAAACAGCGGCCAGTGAAGTAAAACTGCTCAAAACGTTAGCGGTTCTCTATGTGATGGATGACCCAGGGCACCTTGCGCGCCAAAACAGGCAACGGGATCGTATCTTCCGGGTTTTTGACTACCTGGTGCTGGGGGCTCCGGGATCGTTGGATCCGATGTATCGCCAGTGGTTTATTGAAGCGGATTCAGAATCGGAACAGATCCGTGTGATTGTTGATCAGATTGCGTCGATGACGGAGTCTCGTCTGGAACGCCTTGCCCGGAATGCTGCTGACATCTCAGGATTCTTGGGATAGTTGGTTAGAGCAGCAGCGATTTTTAGTAAGGCCAATAACATGTTTTGGCTTAAACCTGTGTCGTGTCAGATGGTGGCGAAGTAGAGTTCGCAAAGCTAGCGAACATGAATTCGTGTTCAGGAACTTAACAGGGATCAAACAGAGAACAGAGAACAGATCACGCTGCCCAAAAATCGCACTTTTAAGGTTTGTGGGCGTCTGTGTGTGGTTTGCCGCTGTAAAGTATCACCACGTTATGCGCCCTGGTGTGATCAAGCGTTCGTTCTGGGTCGAAACCCCAAAAGTCACAATTCCCCAGAAGCGGGTCAAACCCATTTAGCTTATTGCTTACATATCGAGGGTTTAGAAAAGTGATTTGTCGGATCAGTCGGTTTCTGCCAAGTAAATAGAACTTTATAAATTTTGTGGCTCTCAAATCTTAGGCCACGGCTTCCGATTTGAACCGGAGGTTCAAAAGGCTTATATAGACAAGATTCTGCATCGTCTCACGAGCCCCTCATTGCCTGACACGGTCAATCGTGTGGGAGGTACCAATCCGTGAGATTTCTGCCAACGAGCGATTCATTGGCCCCGCTGCAGAGCTGGCAGAACACGGACATAACCCAAATAATCTGAGGTCTGCCGTTTGCAGCAGCATTAGCGTTTGATGTGGAAGGTGATGCAGAGGCTGTTGATCTGCAAGCGCGTCTTTCCCAAGCACGGGGGAACCCTGAAGCATCGGATGCTCTAGTTGCTGAGCTGACTGGTGTTACTGCTAATCATCCGTTGGTCAGTGCTTGTCTGAAGTTTCCGCTCAATCCTAAGCTTCTCAAGATTTCGTAAAAAAGCTGCCAACTACCGTAAAACCGCACTACTAGAGGAGTGCGTTTTTCGTTCCTGAACACATTGCGTGCTGCAACTTAATTATGGTCCTCCCAGCTCAGTGTGCTGTGTGGATTGTTTATTCTCGTCCATTAAGTGATCGAGAAAAAGTTGTTGTAAAGTCATGCGCATGTGTGGAATTGTTGGATATATTGGCCAGGCGGGCGACTCCCGTGATTACTTTGCGCTTGACGTCGTTTTAGAAGGACTGCGCCGACTTGAATACCGCGGTTATGATTCCGCAGGTGTAGCTGTTCATGCGAACGGTGAAATCAGCTACCGAAAGAAGGCTGGAAAGGTAGCTGCGCTGGACGCTGAGATCGCTCGCGCTCCTTTGGCGGATTCCATTTTGGCTATTGGTCACACCCGGTGGGCAACTCACGGTGGACCAACCGATGCAAATGCACACCCCCATGTTGTTGATGGCGGCAAGTTAGCTGTCGTACACAACGGTATTATTGAAAACTTTGCAGAGCTGCGCGCAGAGCTTTCAGCTAAGGGCTACAACTTTGTTTCCGTTACTGACACTGAAGTTGCCGCCACATTGCTGGCAGAAATCTACAACACCCAGGCTAATGGCGATCTGACCAAGGCTATGCAGCTTACTGGTCAGCGTCTTGAGGGTGCGTTCACCCTGCTGGCTATCCATGCTGATCATGATGATCGTATTGTTGCAGCGCGCCGTAACTCTCCTTTGGTTATTGGCTTGGGTGAAGGCGAAAACTTCCTCGGCTCTGACGTTTCTGGCTTCATCGATTACACCCGCAAGGCTGTTGAGATGGGCAACGATCAGATTGTGACCATCACTGCGAACGACTACCAGATCACCAACTTCGATGGTTCTGAGGCAACCGGAAAACCTTTCGACGTGGAGTGGGATGCGGCTGCTGCTGAAAAGGGTGGCTTTGATTCCTTCATGGATAAGGAAATCCACGACCAGCCAGCTGCAGTGCGTGACACCCTCCTCGGACGTTTAGATGAGGATGGCAAGCTGGTCCTTGATGAGCTTCGT diff --git a/test/test_parsemarkers.py b/test/test_parsemarkers.py new file mode 100644 index 00000000..7dace124 --- /dev/null +++ b/test/test_parsemarkers.py @@ -0,0 +1,37 @@ +import unittest +import vamb +import testtools +from pathlib import Path +import tempfile +import shutil +import io + + +class TestParseMarkers(unittest.TestCase): + def test_instantiate(self): + tmp = tempfile.mkdtemp() + tmp_path = Path(tmp) + shutil.rmtree(tmp) + markers = vamb.parsemarkers.Markers.from_files( + Path(testtools.DATADIR).joinpath("marker.fna"), + Path(testtools.PARENTDIR).joinpath("vamb").joinpath("marker.hmm"), + ["abc"], + tmp_path, + 4, + None, + ) + 
self.assertIsNotNone(markers.markers[0]) + self.assertEqual(len(markers.markers), 1) + self.assertEqual(set(markers.markers[0]), {39}) + self.assertEqual( + markers.refhash, vamb.vambtools.RefHasher.hash_refnames(["abc"]) + ) + + buf = io.StringIO() + markers.save(buf) + buf.seek(0) + + markers2 = vamb.parsemarkers.Markers.load(buf, markers.refhash) + self.assertEqual(len(markers.markers), len(markers2.markers)) + self.assertEqual(set(markers.markers[0]), set(markers2.markers[0])) + self.assertEqual(markers.marker_names, markers2.marker_names) diff --git a/test/test_reclustering.py b/test/test_reclustering.py index 0118f74b..6c250039 100644 --- a/test/test_reclustering.py +++ b/test/test_reclustering.py @@ -1,8 +1,5 @@ import unittest -# Invariants: -# It produces disjoint clusters, a subset of the input points - # For CAMI dataset, compute comp, abundance, taxonomy, markers # Subset to e.g. 5 genera plus a few unclassified contigs @@ -22,3 +19,8 @@ class TestKmeansReclustering(unittest.TestCase): # Create latent # Initial clustering + + +class TestDBScanReclustering(unittest.TestCase): + # It produces disjoint clusters, a subset of the input points + pass diff --git a/test/test_semisupervised_encode.py b/test/test_semisupervised_encode.py index 27369341..6472e3ab 100644 --- a/test/test_semisupervised_encode.py +++ b/test/test_semisupervised_encode.py @@ -31,14 +31,14 @@ def make_random_annotation(self): phylum = np.random.choice(self.phyla, 1)[0] clas = np.random.choice(self.classes[phylum], 1)[0] if np.random.random() <= 0.2: - return vamb.vambtools.ContigTaxonomy.from_semicolon_sep( + return vamb.taxonomy.ContigTaxonomy.from_semicolon_sep( ";".join([self.domain]) ) if 0.2 < np.random.random() <= 0.5: - return vamb.vambtools.ContigTaxonomy.from_semicolon_sep( + return vamb.taxonomy.ContigTaxonomy.from_semicolon_sep( ";".join([self.domain, phylum]) ) - return vamb.vambtools.ContigTaxonomy.from_semicolon_sep( + return vamb.taxonomy.ContigTaxonomy.from_semicolon_sep( ";".join([self.domain, phylum, clas]) ) diff --git a/vamb/__init__.py b/vamb/__init__.py index 9981660c..10dce8c5 100644 --- a/vamb/__init__.py +++ b/vamb/__init__.py @@ -16,6 +16,7 @@ from . import parsebam from . import parsecontigs from . import parsemarkers +from . import taxonomy from . import cluster from . import encode from . 
import aamb_encode @@ -35,6 +36,7 @@ "parsebam", "parsecontigs", "parsemarkers", + "taxonomy", "cluster", "encode", "aamb_encode", diff --git a/vamb/__main__.py b/vamb/__main__.py index d05863d2..b5f73fbd 100755 --- a/vamb/__main__.py +++ b/vamb/__main__.py @@ -15,7 +15,6 @@ from typing import Optional, Tuple, Union, cast, Callable, Literal, NamedTuple from pathlib import Path from collections.abc import Sequence -from collections import defaultdict from torch.utils.data import DataLoader from functools import partial from loguru import logger @@ -377,11 +376,18 @@ def __init__( class TaxonomyPath: @classmethod def from_args(cls, args: argparse.Namespace): + if args.taxonomy is None: + raise argparse.ArgumentTypeError( + "Cannot load taxonomy without specifying --taxonomy" + ) return cls(typeasserted(args.taxonomy, Path)) def __init__(self, path: Path): self.path = check_existing_file(path) + def get_tax_path(self) -> Path: + return self.path + class TaxometerOptions: @classmethod @@ -450,6 +456,13 @@ def from_args(cls, args: argparse.Namespace): def __init__(self, path_or_tax_options: Union[TaxonomyPath, TaxometerOptions]): self.path_or_tax_options = path_or_tax_options + def get_tax_path(self) -> Path: + p = self.path_or_tax_options + if isinstance(p, TaxonomyPath): + return p.get_tax_path() + else: + return p.taxonomy.get_tax_path() + class MarkerPath: def __init__(self, path: Path): @@ -940,6 +953,41 @@ def load_composition_and_abundance( return (composition, abundance) +def load_markers( + options: MarkerOptions, + comp_metadata: vamb.parsecontigs.CompositionMetaData, + existing_outdir: Path, + n_threads: int, +) -> vamb.parsemarkers.Markers: + logger.info("Loading markers") + begin_time = time.time() + if isinstance(options.content, MarkerPath): + path = options.content.path + logger.info(f'\tLoading markers from existing `markers.npz` at "{path}"') + markers = vamb.parsemarkers.Markers.load(path, comp_metadata.refhash) + elif isinstance(options.content, PredictMarkers): + logger.info("\tPredicting markers. 
This might take some time") + logger.info(f"\t\tFASTA file located at {options.content.fasta.path}") + logger.info( + f"\t\tHMM profile file (.hmm file) located at {options.content.hmm}" + ) + markers = vamb.parsemarkers.Markers.from_files( + options.content.fasta.path, + options.content.hmm, + cast(Sequence[str], comp_metadata.identifiers), + existing_outdir.joinpath("tmp_markers"), + n_threads, + comp_metadata.refhash, + ) + markers.save(existing_outdir.joinpath("markers.npz")) + else: + assert False + + elapsed = round(time.time() - begin_time, 2) + logger.info(f"\tProcessed markers in {elapsed} seconds.\n") + return markers + + def trainvae( vae_options: VAEOptions, vamb_options: GeneralOptions, @@ -1267,26 +1315,26 @@ def run_bin_aae(opt: BinAvambOptions): def predict_taxonomy( + comp_metadata: vamb.parsecontigs.CompositionMetaData, abundance: np.ndarray, tnfs: np.ndarray, lengths: np.ndarray, - contignames: np.ndarray, - taxonomy_path: Path, out_dir: Path, taxonomy_options: TaxometerOptions, cuda: bool, -): +) -> vamb.taxonomy.PredictedTaxonomy: begintime = time.time() + logger.info("Predicting taxonomy with Taxometer") - taxonomies = vamb.vambtools.parse_taxonomy( - input_file=taxonomy_path, - contignames=cast(Sequence[str], contignames), - is_canonical=False, + taxonomies = vamb.taxonomy.Taxonomy.from_file( + taxonomy_options.taxonomy.path, comp_metadata, False + ) + nodes, ind_nodes, table_parent = vamb.taxvamb_encode.make_graph( + taxonomies.contig_taxonomies ) - nodes, ind_nodes, table_parent = vamb.taxvamb_encode.make_graph(taxonomies) - logger.info(f"{len(nodes)} nodes in the graph") + logger.info(f"\t{len(nodes)} nodes in the graph") classes_order: list[str] = [] - for i in taxonomies: + for i in taxonomies.contig_taxonomies: if i is None or len(i.ranks) == 0: classes_order.append("root") else: @@ -1342,36 +1390,31 @@ def predict_taxonomy( nodes_ar = np.array(nodes) logger.info(f"Using threshold {taxonomy_options.softmax_threshold}") - row = 0 - output_exists = False + contig_taxonomies: list[vamb.taxonomy.PredictedContigTaxonomy] = [] for predicted_vector, predicted_labels in model.predict(dataloader_vamb): - N = len(predicted_vector) - predictions: list[str] = [] - probs: list[str] = [] for i in range(predicted_vector.shape[0]): label = predicted_labels[i] while table_parent[label] != -1: label = table_parent[label] - threshold_mask = predicted_vector[i] > taxonomy_options.softmax_threshold - pred_line = ";".join(nodes_ar[threshold_mask][1:]) - predictions.append(pred_line) - absolute_probs = predicted_vector[i][threshold_mask] - absolute_prob = ";".join(map(str, absolute_probs[1:])) - probs.append(absolute_prob) - - with open(predicted_path, "a") as file: - if not output_exists: - print("contigs\tpredictions\tlengths\tscores", file=file) - for contig, length, prediction, prob in zip( - contignames[row : row + N], predictions, lengths[row : row + N], probs - ): - print(contig, length, prediction, prob, file=file, sep="\t") - output_exists = True - row += N + threshold_mask: Sequence[bool] = ( + predicted_vector[i] > taxonomy_options.softmax_threshold + ) + ranks = list(nodes_ar[threshold_mask][1:]) + probs = predicted_vector[i][threshold_mask][1:] + tax = vamb.taxonomy.PredictedContigTaxonomy( + vamb.taxonomy.ContigTaxonomy(ranks), probs + ) + contig_taxonomies.append(tax) + + taxonomy = vamb.taxonomy.PredictedTaxonomy(contig_taxonomies, comp_metadata, False) + + with open(predicted_path, "w") as file: + taxonomy.write_as_tsv(file, comp_metadata) logger.info( f"Completed 
taxonomy predictions in {round(time.time() - begintime, 2)} seconds." ) + return taxonomy def run_taxonomy_predictor(opt: TaxometerOptions): @@ -1381,18 +1424,17 @@ def run_taxonomy_predictor(opt: TaxometerOptions): opt.abundance, vamb.vambtools.BinSplitter.inert_splitter(), ) - abundance, tnfs, lengths, contignames = ( + abundance, tnfs, lengths, _ = ( abundance.matrix, composition.matrix, composition.metadata.lengths, composition.metadata.identifiers, ) predict_taxonomy( + comp_metadata=composition.metadata, abundance=abundance, tnfs=tnfs, lengths=lengths, - contignames=contignames, - taxonomy_path=opt.taxonomy.path, out_dir=opt.general.out_dir, taxonomy_options=opt, cuda=opt.general.cuda, @@ -1415,32 +1457,29 @@ def run_vaevae(opt: BinTaxVambOptions): ) if isinstance(opt.taxonomy.path_or_tax_options, TaxometerOptions): logger.info("Predicting missing values from taxonomy") - predict_taxonomy( + predicted_contig_taxonomies = predict_taxonomy( + comp_metadata=composition.metadata, abundance=abundance, tnfs=tnfs, lengths=lengths, - contignames=contignames, - taxonomy_path=opt.taxonomy.path_or_tax_options.taxonomy.path, out_dir=opt.common.general.out_dir, taxonomy_options=opt.taxonomy.path_or_tax_options, cuda=opt.common.general.cuda, ) - contig_taxonomies = vamb.vambtools.parse_taxonomy( - input_file=opt.common.general.out_dir.joinpath("results_taxometer.tsv"), - contignames=cast(Sequence[str], contignames), - is_canonical=False, - ) + contig_taxonomies = predicted_contig_taxonomies.to_taxonomy() else: logger.info("Not predicting the taxonomy") - contig_taxonomies = vamb.vambtools.parse_taxonomy( - input_file=opt.taxonomy.path_or_tax_options.path, - contignames=cast(Sequence[str], contignames), - is_canonical=False, + contig_taxonomies = vamb.taxonomy.Taxonomy.from_file( + opt.taxonomy.path_or_tax_options.path, + composition.metadata, + False, ) - nodes, ind_nodes, table_parent = vamb.taxvamb_encode.make_graph(contig_taxonomies) + nodes, ind_nodes, table_parent = vamb.taxvamb_encode.make_graph( + contig_taxonomies.contig_taxonomies + ) classes_order: list[str] = [] - for i in contig_taxonomies: + for i in contig_taxonomies.contig_taxonomies: if i is None or len(i.ranks) == 0: classes_order.append("root") else: @@ -1529,6 +1568,9 @@ def run_vaevae(opt: BinTaxVambOptions): ) +# TODO: The whole data flow around predict_taxonomy needs to change. +# Ideally, we should have a "get taxonomy" function that loads, possibly refines, +# and then returns the taxonomy object. 
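The refactored predict_taxonomy above no longer streams TSV rows; for each contig it masks the per-node softmax scores against the threshold and wraps the surviving ranks in a PredictedContigTaxonomy. A minimal sketch of that masking step, using a made-up single-lineage node array and toy scores rather than Vamb's full taxonomy graph (only the variable names are taken from the code above; the values are illustrative):

import numpy as np

# Hypothetical toy values: index 0 is the artificial "root" node,
# which is why the real code drops element 0 via [1:] after masking.
nodes_ar = np.array(["root", "Bacteria", "Bacillota", "Bacilli"])
predicted_vector = np.array([1.0, 0.99, 0.87, 0.42])  # softmax score per node
softmax_threshold = 0.5

threshold_mask = predicted_vector > softmax_threshold  # [True, True, True, False]
ranks = list(nodes_ar[threshold_mask][1:])             # ["Bacteria", "Bacillota"]
probs = predicted_vector[threshold_mask][1:]           # [0.99, 0.87]
# The lineage is kept only down to the ranks whose score clears the threshold.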
def run_reclustering(opt: ReclusteringOptions): composition = calc_tnf( opt.composition, @@ -1536,35 +1578,81 @@ def run_reclustering(opt: ReclusteringOptions): opt.general.out_dir, opt.output.binsplitter, ) - logger.info(f"{len(composition.metadata.identifiers)} contig names after filtering") - logger.info("Taxonomy predictions are provided") - markers = opt.markers.content - assert isinstance(markers, PredictMarkers) - reclustered = vamb.reclustering.recluster_bins( - None if isinstance(opt.algorithm, DBScanOptions) else opt.algorithm.clusters, - opt.latent_path, - str(opt.composition.path.path), - composition.metadata.identifiers, - out_dir=opt.general.out_dir, - minfasta=0, - binned_length=1000, - num_process=40, - random_seed=opt.general.seed, - hmmout_path=markers.hmm, - algorithm="kmeans" if isinstance(opt.algorithm, KmeansOptions) else "dbscan", - predictions_path=None - if isinstance(opt.algorithm, KmeansOptions) - else ( - opt.algorithm.taxonomy_options.path_or_tax_options.path - if isinstance( - opt.algorithm.taxonomy_options.path_or_tax_options, TaxonomyPath + markers = load_markers( + opt.markers, composition.metadata, opt.general.out_dir, opt.general.n_threads + ) + latent = vamb.vambtools.read_npz(opt.latent_path) + alg = opt.algorithm + + if isinstance(alg, DBScanOptions): + # If we should refine or not. + if isinstance(alg.taxonomy_options.path_or_tax_options, TaxometerOptions): + taxopt = alg.taxonomy_options.path_or_tax_options + abundance = calc_abundance( + taxopt.abundance, + taxopt.general.out_dir, + taxopt.general.refcheck, + composition.metadata, + taxopt.general.n_threads, ) - else opt.algorithm.taxonomy_options.path_or_tax_options.taxonomy.path - ), - ) - cluster_dict: defaultdict[str, set[str]] = defaultdict(set) - for k, v in zip(reclustered, composition.metadata.identifiers): - cluster_dict[cast(str, k)].add(v) + predicted_tax = predict_taxonomy( + composition.metadata, + abundance.matrix, + composition.matrix, + composition.metadata.lengths, + taxopt.general.out_dir, + taxopt, + taxopt.general.cuda, + ) + taxonomy = predicted_tax.to_taxonomy() + else: + tax_path = alg.taxonomy_options.path_or_tax_options.path + logger.info(f'Loading taxonomy from file "{tax_path}"') + taxonomy = vamb.taxonomy.Taxonomy.from_file( + tax_path, composition.metadata, True + ) + instantiated_alg = vamb.reclustering.DBScanAlgorithm( + composition.metadata, taxonomy, opt.general.n_threads + ) + logger.info("Reclustering") + logger.info("\tAlgorithm: DBSCAN") + reclustered_contigs = vamb.reclustering.recluster_bins( + markers, latent, instantiated_alg + ) + else: + with open(alg.clusters) as file: + clusters = vamb.vambtools.read_clusters(file) + contig_to_id: dict[str, int] = { + c: i for (i, c) in enumerate(composition.metadata.identifiers) + } + clusters_as_ids: list[set[vamb.reclustering.ContigId]] = [] + for cluster in clusters.values(): + s = set() + for contig in cluster: + i = contig_to_id.get(contig, None) + if i is None: + raise ValueError( + f'Contig "{contig}" found in the provided clusters file is not found in the ' + "provided composition." 
+ ) + s.add(vamb.reclustering.ContigId(i)) + clusters_as_ids.append(s) + instantiated_alg = vamb.reclustering.KmeansAlgorithm( + clusters_as_ids, + abs(opt.general.seed) % 4294967295, # Note: sklearn seeds must be uint32 + composition.metadata.lengths, + ) + logger.info("Reclustering") + logger.info("\tAlgorithm: KMeans") + reclustered_contigs = vamb.reclustering.recluster_bins( + markers, latent, instantiated_alg + ) + + logger.info("\tReclustering complete") + identifiers = composition.metadata.identifiers + clusters_dict: dict[str, set[str]] = dict() + for i, cluster in enumerate(reclustered_contigs): + clusters_dict[str(i)] = {identifiers[c] for c in cluster} if opt.output.min_fasta_output_size is None: fasta_output = None @@ -1580,7 +1668,7 @@ def run_reclustering(opt: ReclusteringOptions): fasta_output, opt.output.binsplitter, str(opt.general.out_dir.joinpath("clusters_reclustered")), - cluster_dict, + clusters_dict, cast(Sequence[str], composition.metadata.identifiers), cast(Sequence[int], composition.metadata.lengths), ) @@ -1857,6 +1945,15 @@ def add_predictor_arguments(subparser): default=1024, help=argparse.SUPPRESS, ) + pred_trainos.add_argument( + "-pq", + dest="pred_batchsteps", + metavar="", + type=int, + nargs="*", + default=[25, 75], + help=argparse.SUPPRESS, + ) pred_trainos.add_argument( "-pthr", dest="pred_softmax_threshold", @@ -2164,6 +2261,7 @@ def main(): add_marker_arguments(recluster_parser) add_bin_output_arguments(recluster_parser) add_reclustering_arguments(recluster_parser) + add_predictor_arguments(recluster_parser) add_taxonomy_arguments(recluster_parser) args = parser.parse_args() diff --git a/vamb/parsemarkers.py b/vamb/parsemarkers.py index d7210c9e..7d149a2c 100644 --- a/vamb/parsemarkers.py +++ b/vamb/parsemarkers.py @@ -18,8 +18,9 @@ from collections import defaultdict import json import numpy as np +from loguru import logger -MarkerID = NewType("Marker", int) +MarkerID = NewType("MarkerID", int) MarkerName = NewType("MarkerName", str) ContigID = NewType("ContigID", int) ContigName = NewType("ContigName", str) @@ -123,9 +124,9 @@ def from_files( cls, contigs: Path, hmm_path: Path, + contignames: Sequence[str], tmpdir_to_create: Path, n_processes: int, - fasta_entry_mask: Sequence[bool], target_refhash: Optional[bytes], ): """ @@ -141,37 +142,16 @@ def from_files( If the target refhash is not None, and the computed reference hash does not match, an exception is thrown. See vamb.vambtools.RefHasher. 
""" - if n_processes < 1: - raise ValueError(f"Must use at least 1 process, not {n_processes}") - # Cap processes, because most OSs cap the number of open file handles, - # and we need one file per process when splitting FASTA file - elif n_processes > 64: - print(f"Warning: Processes set to {n_processes}, capping to 64") - n_processes = 64 - name_to_id: dict[MarkerName, MarkerID] = dict() - - # Create the list of marker names, translating those that are the same, - # but appear under two marker names + n_processes = cap_processes(n_processes) with open(hmm_path, "rb") as file: - for hmm in pyhmmer.plan7.HMMFile(file): - name = hmm.name.decode() - if name in NORMALIZE_MARKER_TRANS_DICT: - continue - name_to_id[MarkerName(name)] = MarkerID(len(name_to_id)) - for old_name, new_name in NORMALIZE_MARKER_TRANS_DICT.items(): - name_to_id[MarkerName(old_name)] = name_to_id[MarkerName(new_name)] - id_to_names: defaultdict[MarkerID, list[MarkerName]] = defaultdict(list) - for name, id in name_to_id.items(): - id_to_names[id].append(name) - marker_names = [id_to_names[MarkerID(i)] for i in range(len(id_to_names))] - - # For safety: Verify that there are no more than 256 MarkerIDs, such that we can - # store them in an uint8 array without overflow (which does not throw errors - # in the current version of Numpy) - assert len(marker_names) <= 256 + hmms = list(pyhmmer.plan7.HMMFile(file)) + (_, marker_names) = get_name_to_id(hmms) (refhash, paths) = split_file( - contigs, tmpdir_to_create, n_processes, fasta_entry_mask + contigs, + contignames, + tmpdir_to_create, + n_processes, ) if target_refhash is not None: @@ -179,16 +159,17 @@ def from_files( refhash, target_refhash, "Markers FASTA file", None, None ) - marker_list: list[Optional[np.ndarray]] = [None] * sum(fasta_entry_mask) + index_of_name = { + ContigName(n): ContigID(i) for (i, n) in enumerate(contignames) + } + marker_list: list[Optional[np.ndarray]] = [None] * len(contignames) with Pool(n_processes) as pool: for sub_result in pool.imap_unordered( work_per_process, - list( - zip(paths, itertools.repeat(hmm_path), itertools.repeat(name_to_id)) - ), + list(zip(paths, itertools.repeat(hmms))), ): - for contig_id, markers in sub_result: - marker_list[contig_id] = markers + for contig_name, markers in sub_result: + marker_list[index_of_name[contig_name]] = markers shutil.rmtree(tmpdir_to_create) markers = cls(marker_list, marker_names, refhash) @@ -196,6 +177,17 @@ def from_files( return markers +def cap_processes(processes: int) -> int: + if processes < 1: + raise ValueError(f"Must use at least 1 process, not {processes}") + # Cap processes, because most OSs cap the number of open file handles, + # and we need one file per process when splitting FASTA file + elif processes > 64: + logger.warning(f"Processes set to {processes}, capping to 64") + return 64 + return processes + + # Some markers have different names, but should be treated as the same SCG. 
NORMALIZE_MARKER_TRANS_DICT = { "TIGR00388": "TIGR00389", @@ -205,31 +197,25 @@ def from_files( } -def filter_contigs(reader: Reader, mask: Sequence[bool]) -> Iterable[FastaEntry]: - for record, keep in itertools.zip_longest(byte_iterfasta(reader), mask): - if record is None or keep is None: - raise ValueError( - "The mask length does not match the length of FASTA records" - ) - if keep: - yield record - - def split_file( - input: Path, tmpdir_to_create: Path, n_splits: int, mask: Sequence[bool] + input: Path, + contignames: Sequence[str], + tmpdir_to_create: Path, + n_splits: int, ) -> tuple[bytes, list[Path]]: + names = set(contignames) os.mkdir(tmpdir_to_create) paths = [tmpdir_to_create.joinpath(str(i)) for i in range(n_splits)] filehandles = [open(path, "w") for path in paths] refhasher = RefHasher() with Reader(input) as infile: - records = filter_contigs(infile, mask) - # We write the index to the record and store that instead of the name. for i, (outfile, record) in enumerate( - zip(itertools.cycle(filehandles), records) + zip( + itertools.cycle(filehandles), + filter(lambda x: x.identifier in names, byte_iterfasta(infile)), + ) ): refhasher.add_refname(record.identifier) - record.identifier = str(i) print(record.format(), file=outfile) for filehandle in filehandles: @@ -243,12 +229,12 @@ def process_chunk( hmms: list[pyhmmer.plan7.HMM], name_to_id: dict[MarkerName, MarkerID], finder: pyrodigal.GeneFinder, -) -> list[tuple[ContigID, np.ndarray]]: +) -> list[tuple[ContigName, np.ndarray]]: # We temporarily store them as sets in order to deduplicate. While single contigs # may have duplicate markers, it makes no sense to count this as contamination, # because we are not about to second-guess the assembler's job of avoiding # chimeric sequences. - markers: defaultdict[ContigID, set[MarkerID]] = defaultdict(set) + markers: defaultdict[ContigName, set[MarkerID]] = defaultdict(set) alphabet = pyhmmer.easel.Alphabet.amino() digitized: list[pyhmmer.easel.DigitalSequence] = [] for record in chunk: @@ -267,27 +253,24 @@ def process_chunk( assert score_cutoff is not None for hit in top_hits: if hit.score >= score_cutoff: - markers[ContigID(int(hit.name.decode()))].add(marker_id) + markers[ContigName(hit.name.decode())].add(marker_id) return [ (name, np.array(list(ids), dtype=np.uint8)) for (name, ids) in markers.items() ] -# We avoid moving data between processes as much as possible, so each process -# only needs these two paths, and this marker name dict which we assume to be small. -# The return type here is optimised for a small memory footprint. 
def work_per_process( - args: tuple[Path, Path, dict[MarkerName, MarkerID]], -) -> list[tuple[ContigID, np.ndarray]]: - (contig_path, hmmpath, name_to_id) = args - with open(hmmpath, "rb") as file: - hmms = list(pyhmmer.plan7.HMMFile(file)) + args: tuple[Path, list[pyhmmer.plan7.HMM]], +) -> list[tuple[ContigName, np.ndarray]]: + (contig_path, hmms) = args + + (name_to_id, _) = get_name_to_id(hmms) # Chunk up the FASTA file for memory efficiency reasons, while still # allowing pyhmmer to scan multiple sequences at once for speed chunk: list[FastaEntry] = [] - result: list[tuple[ContigID, np.ndarray]] = [] + result: list[tuple[ContigName, np.ndarray]] = [] finder = pyrodigal.GeneFinder(meta=True) with open(contig_path, "rb") as file: for record in byte_iterfasta(file): @@ -298,3 +281,26 @@ def work_per_process( result.extend(process_chunk(chunk, hmms, name_to_id, finder)) return result + + +def get_name_to_id( + hmms: list[pyhmmer.plan7.HMM], +) -> tuple[dict[MarkerName, MarkerID], list[list[MarkerName]]]: + name_to_id: dict[MarkerName, MarkerID] = dict() + for hmm in hmms: + name = hmm.name.decode() + if name in NORMALIZE_MARKER_TRANS_DICT: + continue + name_to_id[MarkerName(name)] = MarkerID(len(name_to_id)) + for old_name, new_name in NORMALIZE_MARKER_TRANS_DICT.items(): + name_to_id[MarkerName(old_name)] = name_to_id[MarkerName(new_name)] + + if len(set(name_to_id.values())) > 256: + raise ValueError("Maximum 256 marker IDs") + + id_to_names: defaultdict[MarkerID, list[MarkerName]] = defaultdict(list) + for n, i in name_to_id.items(): + id_to_names[i].append(n) + marker_names = [id_to_names[MarkerID(i)] for i in range(len(id_to_names))] + + return name_to_id, marker_names diff --git a/vamb/reclustering.py b/vamb/reclustering.py index 019acbc1..0c4cfcc8 100644 --- a/vamb/reclustering.py +++ b/vamb/reclustering.py @@ -1,563 +1,342 @@ """ -The following code is a modification of a k-means based reclustering algorithm first published at https://github.com/BigDataBiology/SemiBin +The following code is based on the k-means based reclustering algorithm first published at https://github.com/BigDataBiology/SemiBin The original code is distributed under MIT License. 
""" -import os -import subprocess -import sys -import contextlib from sklearn.cluster import KMeans import numpy as np from collections import defaultdict -import pandas as pd from sklearn.cluster import DBSCAN from sklearn.metrics import pairwise_distances -import gzip -import lzma -from loguru import logger -import vamb - - -def get_best_bin( - results_dict, contignames, contig_to_marker, namelist, contig_dict, minfasta -): - # There is room for improving the loop below to avoid repeated computation - # but it runs very fast in any case - for max_contamination in [0.1, 0.2, 0.3, 0.4, 0.5, 1]: - max_F1 = 0 - weight_of_max = 1e9 - max_bin = None - for res_labels in results_dict.values(): - res = defaultdict(list) - for label, name in zip(res_labels, namelist): - if label != -1: - res[label].append(name) - for bin_contig in res.values(): - cur_weight = sum(len(contig_dict[contig]) for contig in bin_contig) - if cur_weight < minfasta: - continue - marker_list = [] - for contig in bin_contig: - marker_list.extend(contig_to_marker[contig]) - if len(marker_list) == 0: - continue - recall = len(set(marker_list)) / 104 - contamination = (len(marker_list) - len(set(marker_list))) / len( - marker_list - ) - if contamination <= max_contamination: - F1 = ( - 2 - * recall - * (1 - contamination) - / (recall + (1 - contamination)) - ) - if F1 > max_F1: - max_F1 = F1 - weight_of_max = cur_weight - max_bin = bin_contig - elif F1 == max_F1 and cur_weight <= weight_of_max: - weight_of_max = cur_weight - max_bin = bin_contig - if max_F1 > 0: # if there is a bin with F1 > 0 - return max_bin - - -def fasta_iter(fname, full_header=False): - """Iterate over a (possibly gzipped) FASTA file - - Parameters - ---------- - fname : str - Filename. - If it ends with .gz, gzip format is assumed - If .bz2 then bzip2 format is assumed - if .xz, then lzma format is assumerd - full_header : boolean (optional) - If True, yields the full header. Otherwise (the default), only the - first word - - Yields - ------ - (h,seq): tuple of (str, str) - """ - header = None - chunks = [] - if hasattr(fname, "readline"): - - def op(f, _): - return f - - elif fname.endswith(".gz"): - op = gzip.open - elif fname.endswith(".bz2"): - import bz2 - - op = bz2.open - elif fname.endswith(".xz"): - op = lzma.open - else: - op = open - with op(fname, "rt") as f: - for line in f: - if line[0] == ">": - if header is not None: - yield header, "".join(chunks) - line = line[1:].strip() - if not line: - header = "" - elif full_header: - header = line.strip() - else: - header = line.split()[0] - chunks = [] - else: - chunks.append(line.strip()) - if header is not None: - yield header, "".join(chunks) - - -normalize_marker_trans__dict = { - "TIGR00388": "TIGR00389", - "TIGR00471": "TIGR00472", - "TIGR00408": "TIGR00409", - "TIGR02386": "TIGR02387", -} - - -def replace_bin_names(data, clusters_labels_map): - """ - The 'orf' column is formatted as 'bin.contig_id' - but there is no need to search again if only the bins are different but the contigs are the same. - This function replaces bin names in the 'orf' column - with the new bin names from another cluster file given the same contigs. 
- - TODO: refactor the whole file so there is no need for that - - Input: - - data: pd.DataFrame - - clusters_labels_map: dict, {contig: bin} - - Returns a copy of the dataframe with the updated 'orf' column - """ - data = data.copy() - data["contig_number"] = data["orf"].str.split(pat=".", n=1, expand=True)[1] - data["contig_only"] = data["contig_number"].map(lambda x: x.rsplit("_", 1)[0]) - data["old_bin"] = data["orf"].str.split(pat=".", n=0, expand=True)[0] - data["new_bin"] = data["contig_only"].map( - lambda x: f"bin{clusters_labels_map[x]:06}" - ) - data["orf"] = data[["new_bin", "contig_number"]].agg(".".join, axis=1) - return data - - -def get_marker( - hmmout, - fasta_path=None, - min_contig_len=None, - multi_mode=False, - orf_finder=None, - contig_to_marker=False, - clusters_dict=None, -): - """Parse HMM output file and return markers""" - data = pd.read_table( - hmmout, - sep=r"\s+", - comment="#", - header=None, - usecols=(0, 3, 5, 15, 16), - names=["orf", "gene", "qlen", "qstart", "qend"], - ) - if clusters_dict is not None: - data = replace_bin_names(data, clusters_dict) - if not len(data): - return [] - data["gene"] = data["gene"].map(lambda m: normalize_marker_trans__dict.get(m, m)) - qlen = data[["gene", "qlen"]].drop_duplicates().set_index("gene")["qlen"] - - def contig_name(ell): - if orf_finder in ["prodigal", "fast-naive"]: - contig, _ = ell.rsplit("_", 1) - else: - contig, _, _, _ = ell.rsplit("_", 3) - return contig - - data = data.query("(qend - qstart) / qlen > 0.4").copy() - data["contig"] = data["orf"].map(contig_name) - if min_contig_len is not None: - contig_len = {h.split(".", 1)[1]: len(seq) for h, seq in fasta_iter(fasta_path)} - data = data[ - data["contig"].map( - lambda c: contig_len[c.split(".", 1)[1]] >= min_contig_len +from vamb.taxonomy import Taxonomy +from vamb.parsemarkers import Markers, MarkerID +from vamb.parsecontigs import CompositionMetaData +from vamb.vambtools import RefHasher +from collections.abc import Sequence, Iterable +from typing import NewType, Optional, Union + +# We use these aliases to be able to work with integers, which is faster. +ContigId = NewType("ContigId", int) +BinId = NewType("BinId", int) + +# TODO: We might want to benchmark the best value for this constant. +# Right now, we do too much duplicated work by clustering 18 times. 
+EPS_VALUES = np.arange(0.01, 0.35, 0.02) + + +class KmeansAlgorithm: + "Arguments needed specifically when using the KMeans algorithm" + + def __init__( + self, clusters: list[set[ContigId]], random_seed: int, contiglengths: np.ndarray + ): + assert np.issubdtype(contiglengths.dtype, np.integer) + self.contiglengths = contiglengths + self.clusters = clusters + self.random_seed = random_seed + + +class DBScanAlgorithm: + "Arguments needed specifically when using the DBScan algorithm" + + def __init__( + self, comp_metadata: CompositionMetaData, taxonomy: Taxonomy, n_processes: int + ): + if not taxonomy.is_canonical: + raise ValueError( + "Can only run DBScan on a Taxonomy object with is_canonical set" ) - ] - data = data.drop_duplicates(["gene", "contig"]) - - if contig_to_marker: - from collections import defaultdict - - marker = data["gene"].values - contig = data["contig"].str.lsplit(".").str[-1].values - sequence2markers = defaultdict(list) - for m, c in zip(marker, contig): - sequence2markers[c].append(m) - return sequence2markers - else: - - def extract_seeds(vs, sel): - vs = vs.sort_values() - median = vs[len(vs) // 2] - - # the original version broke ties by picking the shortest query, so we - # replicate that here: - candidates = vs.index[vs == median] - c = qlen.loc[candidates].idxmin() - r = list(sel[sel["gene"] == c]["contig"]) - r.sort() - return r - - if multi_mode: - data["bin"] = data["orf"].str.split(pat=".", n=0, expand=True)[0] - counts = data.groupby(["bin", "gene"])["orf"].count() - res = {} - for b, vs in counts.groupby(level=0): - cs = extract_seeds( - vs.droplevel(0), data.query("bin == @b", local_dict={"b": b}) - ) - res[b] = [c.split(".", 1)[1] for c in cs] - return res - else: - counts = data.groupby("gene")["orf"].count() - return extract_seeds(counts, data) - - -def run_prodigal(fasta_path, num_process, output): - contigs = {} - for h, seq in fasta_iter(fasta_path): - contigs[h] = seq - - total_len = sum(len(s) for s in contigs.values()) - split_len = total_len // num_process - - cur = split_len + 1 - next_ix = 0 - out = None - with contextlib.ExitStack() as stack: - for h, seq in contigs.items(): - if cur > split_len and next_ix < num_process: - if out is not None: - out.close() - out = open(os.path.join(output, "contig_{}.fa".format(next_ix)), "wt") - out = stack.enter_context(out) - - cur = 0 - next_ix += 1 - out.write(f">{h}\n{seq}\n") - cur += len(seq) - - try: - process = [] - for index in range(next_ix): - with open( - os.path.join(output, f"contig_{index}_log.txt") + ".out", "w" - ) as prodigal_out_log: - p = subprocess.Popen( - [ - "prodigal", - "-i", - os.path.join(output, f"contig_{index}.fa"), - "-p", - "meta", - "-q", - "-m", # See https://github.com/BigDataBiology/SemiBin/issues/87 - "-a", - os.path.join(output, f"contig_{index}.faa"), - ], - stdout=prodigal_out_log, - ) - process.append(p) - - for p in process: - p.wait() - - except: - sys.stderr.write("Error: Running prodigal fail\n") - sys.exit(1) - - contig_output = os.path.join(output, "contigs.faa") - with open(contig_output, "w") as f: - for index in range(next_ix): - f.write( - open(os.path.join(output, "contig_{}.faa".format(index)), "r").read() - ) - return contig_output - - -def cal_num_bins( - fasta_path, - binned_length, - num_process, - output, - markers_path=None, - clusters_dict=None, - multi_mode=False, - orf_finder="prodigal", -): - """Estimate number of bins from a FASTA file - - Parameters - fasta_path: path - binned_length: int (minimal contig length) - num_process: int 
(number of CPUs to use) - multi_mode: bool, optional (if True, treat input as resulting from concatenating multiple files) - """ - if markers_path is None: - markers_path = os.path.join(output, "markers.hmmout") - if os.path.exists(markers_path): - return get_marker( - markers_path, - fasta_path, - binned_length, - multi_mode, - orf_finder=orf_finder, - clusters_dict=clusters_dict, + RefHasher.verify_refhash( + taxonomy.refhash, + comp_metadata.refhash, + "taxonomy", + "composition", + None, ) - else: - os.makedirs(output, exist_ok=True) - target_dir = output - - contig_output = run_prodigal(fasta_path, num_process, output) - - hmm_output = os.path.join(target_dir, "markers.hmmout") - try: - with open(os.path.join(output, "markers.hmmout.out"), "w") as hmm_out_log: - subprocess.check_call( - [ - "hmmsearch", - "--domtblout", - hmm_output, - "--cut_tc", - "--cpu", - str(num_process), - os.path.split(__file__)[0] + "/marker.hmm", - contig_output, - ], - stdout=hmm_out_log, - ) - except: - if os.path.exists(hmm_output): - os.remove(hmm_output) - sys.stderr.write("Error: Running hmmsearch fail\n") - sys.exit(1) - - return get_marker( - hmm_output, fasta_path, binned_length, multi_mode, orf_finder=orf_finder - ) + self.contiglengths = comp_metadata.lengths + self.taxonomy = taxonomy + self.n_processes = n_processes def recluster_bins( - clusters_path, - latents_path, - contigs_path, - contignames_all, - minfasta, - binned_length, - num_process, - out_dir, - random_seed, - algorithm, - hmmout_path=None, - predictions_path=None, -): - contig_dict = {h: seq for h, seq in fasta_iter(contigs_path)} - embedding = np.load(latents_path) - if "arr_0" in embedding: - embedding = embedding["arr_0"] - if (algorithm == "dbscan") and predictions_path: - df_gt = pd.read_csv(predictions_path) - column = "predictions" - df_gt["genus"] = df_gt[column].str.split(";").str[5].fillna("mock") - species = list(set(df_gt["genus"])) - species_ind = {s: i for i, s in enumerate(species)} - clusters_labels_map = { - k: species_ind[v] for k, v in zip(df_gt["contigs"], df_gt["genus"]) - } - else: - with open(clusters_path) as file: - clusters = vamb.vambtools.read_clusters(file) - index_of_cluster = {c: i for (i, c) in enumerate(clusters.keys())} - clusters_labels_map: dict[str, int] = dict() - for cluster, contigs in clusters.items(): - for contig in contigs: - clusters_labels_map[contig] = index_of_cluster[cluster] - labels_cluster_map = defaultdict(list) - for k, v in clusters_labels_map.items(): - labels_cluster_map[v].append(k) - indices_contigs = {c: i for i, c in enumerate(contignames_all)} - logger.info(f"\tLatent shape {embedding.shape}") - logger.info(f"\tN contignames for the latent space {len(contignames_all)}") - logger.info(f"\tN contigs in fasta files {len(contig_dict)}") - logger.info(f"\tN contigs in the cluster file {len(clusters_labels_map)}") - - contig_labels = np.array([clusters_labels_map[c] for c in contignames_all]) - - assert len(contig_labels) == embedding.shape[0] - - total_size = defaultdict(int) - for i, c in enumerate(contig_labels): - total_size[c] += len(contig_dict[contignames_all[i]]) - - cfasta = os.path.join(out_dir, "concatenated.fna") - with open(cfasta, "wt") as concat_out: - for ix, h in enumerate(contignames_all): - bin_ix = contig_labels[ix] - if total_size[bin_ix] < minfasta: - continue - concat_out.write(f">bin{bin_ix:06}.{h}\n") - concat_out.write(contig_dict[contignames_all[ix]] + "\n") - - logger.info("\tStarting searching for markers") - - if hmmout_path is not None: - 
logger.info(f"\t\thmmout file is provided at {hmmout_path}") - seeds = cal_num_bins( - cfasta, - binned_length, - num_process, - output=out_dir, - markers_path=hmmout_path, - clusters_dict=clusters_labels_map, - multi_mode=True, + markers: Markers, + latent: np.ndarray, + algorithm: Union[KmeansAlgorithm, DBScanAlgorithm], +) -> list[set[ContigId]]: + assert np.issubdtype(algorithm.contiglengths.dtype, np.integer) + assert np.issubdtype(latent.dtype, np.floating) + + if not (len(algorithm.contiglengths) == markers.n_seqs == len(latent)): + raise ValueError( + "Number of elements in contiglengths, markers and latent must match" ) - else: - seeds = cal_num_bins( - cfasta, - binned_length, - num_process, - output=out_dir, - multi_mode=True, + + # Simply dispatch to the right implementation based on the algorithm used + if isinstance(algorithm, KmeansAlgorithm): + return recluster_kmeans( + algorithm.clusters, + latent, + algorithm.contiglengths, + markers, + algorithm.random_seed, ) - # we cannot bypass the orf_finder here, because of the renaming of the contigs - if seeds == []: - logger.info("\tNo bins found in the concatenated fasta file.") - return contig_labels - logger.info("\tFinished searching for markers") - logger.info(f"\tFound {len(seeds)} seeds") - - if algorithm == "dbscan": - contig2marker = get_marker( - hmmout_path, - min_contig_len=binned_length, - fasta_path=cfasta, - orf_finder="prodigal", - clusters_dict=clusters_labels_map, - contig_to_marker=True, + elif isinstance(algorithm, DBScanAlgorithm): + assert len(algorithm.taxonomy.contig_taxonomies) == markers.n_seqs + return recluster_dbscan( + algorithm.taxonomy, + latent, + algorithm.contiglengths, + markers, + algorithm.n_processes, ) - logger.info("\tRunning DBSCAN with cosine distance") - extracted_all = [] - for k, v in labels_cluster_map.items(): - logger.info(f"\tLabel {k}, {len(v)} contigs") - indices = [indices_contigs[c] for c in v] - contignames = contignames_all[indices] - embedding_new = embedding[indices] - DBSCAN_results_dict = {} - distance_matrix = pairwise_distances( - embedding_new, embedding_new, metric="cosine" - ) - length_weight = np.array([len(contig_dict[name]) for name in contignames]) - for eps_value in np.arange(0.01, 0.35, 0.02): - dbscan = DBSCAN( - eps=eps_value, - min_samples=5, - n_jobs=num_process, - metric="precomputed", - ) - dbscan.fit(distance_matrix, sample_weight=length_weight) - labels = dbscan.labels_ - logger.info(f"\t\tepsilon {eps_value}, {len(set(labels))} labels") - DBSCAN_results_dict[eps_value] = labels.tolist() - - logger.info("\tIntegrating results") - - extracted = [] - contignames_list = list(contignames) - while ( - sum(len(contig_dict[contig]) for contig in contignames_list) >= minfasta - ): - if len(contignames_list) == 1: - extracted.append(contignames_list) - break - - max_bin = get_best_bin( - DBSCAN_results_dict, - contig2marker, - contignames_list, - contig_dict, - minfasta, - ) - if not max_bin: - break - - extracted.append(max_bin) - for temp in max_bin: - temp_index = contignames_list.index(temp) - contignames_list.pop(temp_index) - for eps_value in DBSCAN_results_dict: - DBSCAN_results_dict[eps_value].pop(temp_index) - extracted_all.extend(extracted) - - contig2ix = {} - logger.info(f"\t{len(extracted_all)} extracted clusters") - for i, cs in enumerate(extracted_all): - for c in cs: - contig2ix[c] = i - namelist = contignames_all.copy() - contig_labels_reclustered = [contig2ix.get(c, -1) for c in namelist] - elif algorithm == "kmeans": - name2ix = {name: ix 
for ix, name in enumerate(contignames_all)} - contig_labels_reclustered = np.empty_like(contig_labels) - contig_labels_reclustered.fill(-1) - next_label = 0 - for bin_ix in range(contig_labels.max() + 1): - seed = seeds.get(f"bin{bin_ix:06}", []) - num_bin = len(seed) - - if num_bin > 1 and total_size[bin_ix] >= minfasta: - contig_indices = [ - i for i, ell in enumerate(contig_labels) if ell == bin_ix - ] - re_bin_features = embedding[contig_indices] - - seed_index = [name2ix[s] for s in seed] - length_weight = np.array( - [ - len(contig_dict[name]) - for name, ell in zip(contignames_all, contig_labels) - if ell == bin_ix - ] - ) - seeds_embedding = embedding[seed_index] - logger.info(f"\t\tStarting K-means reclutering for bin {bin_ix}") - kmeans = KMeans( - n_clusters=num_bin, - init=seeds_embedding, - n_init=1, - random_state=random_seed, - ) - kmeans.fit(re_bin_features, sample_weight=length_weight) - logger.info("\t\tFinished K-means reclutering") - for i, label in enumerate(kmeans.labels_): - contig_labels_reclustered[contig_indices[i]] = next_label + label - next_label += num_bin - else: - contig_labels_reclustered[contig_labels == bin_ix] = next_label - next_label += 1 - assert contig_labels_reclustered.min() >= 0 - os.remove(cfasta) - else: - raise AssertionError( - "Reclustering algorithm must be one of ['dbscan', 'kmeans']" + + +def recluster_kmeans( + clusters: list[set[ContigId]], + latent: np.ndarray, + contiglengths: np.ndarray, + markers: Markers, + random_seed: int, +) -> list[set[ContigId]]: + assert len(latent) == len(contiglengths) == markers.n_seqs + assert np.issubdtype(contiglengths.dtype, np.integer) + assert np.issubdtype(latent.dtype, np.floating) + assert latent.ndim == 2 + + result: list[set[ContigId]] = [] + indices_by_medoid: dict[int, set[ContigId]] = defaultdict(set) + # We loop over all existing clusters, and determine if they should be split, + # by looking at the median number of single-copy genes in the cluster + for cluster in clusters: + # All clusters with 1 contig by definition cannot have multiple single-copy + # genes (because SCGs are deduplicated within a single contig) + if len(cluster) == 1: + result.append(cluster) + continue + # Get a count of each marker and compute the median count of SCGs + counts = count_markers(cluster, markers) + cp = counts.copy() + cp.sort() + median_counts: int = cp[len(cp) // 2] + # If we have less than 2 SCGs on average, the cluster should not be split, + # and we emit it unchanged + if median_counts < 2: + result.append(cluster) + continue + + # Run K-means with the median number of SCGs to split the contig. + # We weigh the contigs by length. + seeds = get_kmeans_seeds( + cluster, + markers, + contiglengths, # type: ignore + counts, + median_counts, + ) + + cluster_indices = np.array(list(cluster)) + cluter_latent = latent[cluster_indices] + cluster_lengths = contiglengths[cluster_indices] + seed_latent = latent[seeds] + kmeans = KMeans( + n_clusters=median_counts, + init=seed_latent, + n_init=1, + random_state=random_seed, + ) + kmeans.fit(cluter_latent, sample_weight=cluster_lengths) + indices_by_medoid.clear() + for cluster_label, index in zip(kmeans.labels_, cluster_indices): + indices_by_medoid[cluster_label].add(ContigId(index)) + result.extend(indices_by_medoid.values()) + + return result + + +# Get a vector of counts, of each SCG, +# where if MarkerID(5) is seen 9 times, then counts[5] == 9. 
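# For example (hypothetical counts): if one contig in the cluster carries markers {3, 5}
# and another carries {5, 7}, the returned vector has counts[3] == 1, counts[5] == 2 and
# counts[7] == 1, with zeros everywhere else.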
+def count_markers( + contigs: Iterable[ContigId], + markers: Markers, +) -> np.ndarray: + counts = np.zeros(markers.n_markers, dtype=np.int32) + for contig in contigs: + m = markers.markers[contig] + if m is not None: + counts[m] += 1 + return counts + + +# Same as above, but once we see a very high number of marker genes, +# we bail. This is because a large fraction of time spent in this module +# would otherwise be counting markers of huge clusters, long after we already +# know it's hopelessly contaminated +def count_markers_saturated( + contigs: Iterable[ContigId], + markers: Markers, +) -> Optional[np.ndarray]: + counts = np.zeros(markers.n_markers, dtype=np.int32) + n_markers = 0 + n_unique = 0 + # This implies contamination == 1.0 + max_duplicates = 1 * markers.n_markers + for contig in contigs: + m = markers.markers[contig] + if m is not None: + n_markers += len(m) + for i in m: + existing = counts[i] + n_unique += existing == 0 + counts[i] = existing + 1 + + if (n_markers - n_unique) > max_duplicates: + return None + return counts + + +# This is not very effectively implemented, but I assume it does not matter. +# This function looks at all markers that occur exactly `median` times, each of these +# markers corresponding to a list of `median` number of contigs. +# It picks the marker for which the smallest contig that contains it is largest. +# The idea here is that long contigs, which contain one of the SCGs that exist exactly +# `median` times are most likely to be close to the actual medoid +# that Kmeans needs to find. +# This is just one possible seeding strategy. We could also plausibly choose e.g. +# the trio of contigs that have the most SCGs. +def get_kmeans_seeds( + contigs: Iterable[ContigId], + markers: Markers, + contiglengths: Sequence[int], + counts: np.ndarray, + median: int, +) -> list[ContigId]: + considered_markers = {MarkerID(i) for (i, c) in enumerate(counts) if c == median} + contigs_of_markers: dict[MarkerID, list[ContigId]] = defaultdict(list) + for contig in contigs: + m = markers.markers[contig] + if m is None: + continue + for mid in m: + if mid not in considered_markers: + continue + contigs_of_markers[MarkerID(mid)].append(contig) + + candidate_list = list(contigs_of_markers.items()) + pair = max(candidate_list, key=lambda x: min(contiglengths[i] for i in x[1])) + result = pair[1] + assert len(result) == median + return result + + +def get_completeness_contamination(counts: np.ndarray) -> tuple[float, float]: + n_total = counts.sum() + n_unique = (counts > 0).sum() + completeness = n_unique / len(counts) + contamination = (n_total - n_unique) / len(counts) + return (completeness, contamination) + + +# An arbitrary score of a bin, where higher numbers is better. +# completeness - 5 * contamination is used by the CheckM group as a heuristic. 
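# Worked example with made-up numbers: given 104 markers of which 80 are seen at least once
# and 90 hits in total, completeness = 80/104 ≈ 0.77, contamination = (90 - 80)/104 ≈ 0.096,
# giving a score of roughly 0.77 - 5 * 0.096 ≈ 0.29.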
+def score_from_comp_cont(comp_cont: tuple[float, float]) -> float:
+    return comp_cont[0] - 5 * comp_cont[1]
+
+
+def recluster_dbscan(
+    taxonomy: Taxonomy,
+    latent: np.ndarray,
+    contiglengths: np.ndarray,
+    markers: Markers,
+    num_processes: int,
+) -> list[set[ContigId]]:
+    # Since DBScan is computationally expensive, and scales poorly with the number
+    # of contigs, we use taxonomy to only cluster within each genus
+    result: list[set[ContigId]] = []
+    for indices in group_indices_by_genus(taxonomy):
+        genus_latent = latent[indices]
+        genus_clusters = dbscan_genus(
+            genus_latent, indices, contiglengths[indices], markers, num_processes
+        )
+        result.extend(genus_clusters)
+
+    return result
+
+
+# DBScan within the subset of contigs that are annotated with a single genus
+def dbscan_genus(
+    latent_of_genus: np.ndarray,
+    original_indices: np.ndarray,
+    contiglengths_of_genus: np.ndarray,
+    markers: Markers,
+    num_processes: int,
+) -> list[set[ContigId]]:
+    assert len(latent_of_genus) == len(original_indices) == len(contiglengths_of_genus)
+    # Precompute distance matrix. This is O(N^2), but DBScan is even worse,
+    # so this pays off.
+    # TODO: Maybe we should emit a warning if this function is called with too
+    # many points such that this matrix becomes huge?
+    distance_matrix = pairwise_distances(
+        latent_of_genus, latent_of_genus, metric="cosine"
+    )
+    best_bins: tuple[int, dict[int, set[ContigId]]] = (-1, dict())
+    # The DBScan approach works by blindly clustering with different eps values
+    # (a critical parameter for DBscan), and then using SCGs to select the best
+    # subset of clusters.
+    # It's ugly and wasteful, but it does work.
+    n_worse_in_row = 0
+    for eps in EPS_VALUES:
+        print(f"Running eps: {eps}")
+        dbscan = DBSCAN(
+            eps=eps,
+            min_samples=5,
+            n_jobs=num_processes,
+            metric="precomputed",
         )
-    return contig_labels_reclustered
+        dbscan.fit(distance_matrix, sample_weight=contiglengths_of_genus)
+        these_bins: dict[int, set[ContigId]] = defaultdict(set)
+        for original_index, bin_index in zip(original_indices, dbscan.labels_):
+            these_bins[bin_index].add(ContigId(original_index))
+        score = count_good_genomes(these_bins.values(), markers)
+        print(f"Score: {score}, best: {best_bins[0]}, worse: {n_worse_in_row}")
+        if score > best_bins[0]:
+            best_bins = (score, these_bins)
+            n_worse_in_row = 0
+        elif score < best_bins[0]:
+            # This is an elif statement and not an if statement because we don't
+            # want e.g. a series of [1,1,1,1,1] genomes to be considered a performance
+            # degradation leading to early exit, when in fact, it probably means
+            # that eps is too low and should be increased.
+            n_worse_in_row += 1
+        # If we see 3 worse clusterings in a row, we exit early.
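+        # For example, scores of 5, 4, 4, 3 over successive eps values stop the scan
+        # after the fourth clustering; a score equal to the current best neither counts
+        # as worse nor resets the counter.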
+
+        if n_worse_in_row > 2:
+            break
+
+    return list(best_bins[1].values())
+
+
+def count_good_genomes(binning: Iterable[Iterable[ContigId]], markers: Markers) -> int:
+    max_contamination = 0.3
+    min_completeness = 0.75
+    result = 0
+    for contigs in binning:
+        count = count_markers_saturated(contigs, markers)
+        if count is None:
+            continue
+        (comp, cont) = get_completeness_contamination(count)
+        if comp >= min_completeness and cont <= max_contamination:
+            result += 1
+
+    return result
+
+
+def group_indices_by_genus(
+    taxonomy: Taxonomy,
+) -> list[np.ndarray]:
+    if not taxonomy.is_canonical:
+        raise ValueError("Can only group by genus for a canonical taxonomy")
+    by_genus: dict[Optional[str], list[ContigId]] = defaultdict(list)
+    for i, tax in enumerate(taxonomy.contig_taxonomies):
+        genus = None if tax is None else tax.genus
+        by_genus[genus].append(ContigId(i))
+    return [np.array(i, dtype=np.int32) for i in by_genus.values()]
diff --git a/vamb/taxonomy.py b/vamb/taxonomy.py
new file mode 100644
index 00000000..7e88919d
--- /dev/null
+++ b/vamb/taxonomy.py
@@ -0,0 +1,175 @@
+from typing import Optional, IO
+from pathlib import Path
+from vamb.parsecontigs import CompositionMetaData
+import numpy as np
+
+
+class ContigTaxonomy:
+    """
+    Hierarchical taxonomy of some contig.
+    If `is_canonical`, the ranks are assumed to be domain, phylum, class,
+    order, family, genus, species, in that order.
+    The taxonomy may be arbitrarily truncated, e.g. ["Eukaryota", "Chordata"]
+    is a valid (canonical) taxonomy for a human.
+    """
+
+    __slots__ = ["ranks"]
+
+    def __init__(self, ranks: list[str], is_canonical: bool = False):
+        if is_canonical and len(ranks) > 7:
+            raise ValueError(
+                "For a canonical ContigTaxonomy, there must be at most 7 ranks"
+            )
+
+        self.ranks = ranks
+
+    @classmethod
+    def from_semicolon_sep(cls, s: str, is_canonical: bool = False):
+        return cls(s.split(";"), is_canonical)
+
+    @property
+    def genus(self) -> Optional[str]:
+        if len(self.ranks) < 6:
+            return None
+        return self.ranks[5]
+
+
+class Taxonomy:
+    """
+    * contig_taxonomies: An Optional[ContigTaxonomy] for every contig given by the
+      CompositionMetaData used to instantiate
+    * refhash: Refhash of CompositionMetaData used to instantiate
+    * is_canonical: If the taxonomy uses the canonical seven ranks
+      (domain, phylum, class, order, family, genus, species).
+    """
+
+    __slots__ = ["contig_taxonomies", "refhash", "is_canonical"]
+
+    @property
+    def nseqs(self) -> int:
+        return len(self.contig_taxonomies)
+
+    @classmethod
+    def from_file(
+        cls, tax_file: Path, metadata: CompositionMetaData, is_canonical: bool
+    ):
+        observed = cls.parse_tax_file(tax_file, is_canonical)
+        return cls.from_observed(observed, metadata, is_canonical)
+
+    @classmethod
+    def from_observed(
+        cls,
+        observed_taxonomies: list[tuple[str, ContigTaxonomy]],
+        metadata: CompositionMetaData,
+        is_canonical: bool,
+    ):
+        index_of_contigname: dict[str, int] = {
+            c: i for (i, c) in enumerate(metadata.identifiers)
+        }
+        contig_taxonomies: list[Optional[ContigTaxonomy]] = [None] * len(
+            metadata.identifiers
+        )
+        for contigname, taxonomy in observed_taxonomies:
+            index = index_of_contigname.get(contigname)
+            if index is None:
+                raise ValueError(
+                    f'When parsing taxonomy, found contigname "{contigname}", '
+                    "but no sequence of that name is in the FASTA file"
+                )
+            existing = contig_taxonomies[index]
+            if existing is not None:
+                raise ValueError(
+                    f'Duplicate contigname when parsing taxonomy: "{contigname}"'
+                )
+            contig_taxonomies[index] = taxonomy
+        return cls(contig_taxonomies, metadata.refhash, is_canonical)
+
+    def __init__(
+        self,
+        contig_taxonomies: list[Optional[ContigTaxonomy]],
+        refhash: bytes,
+        is_canonical: bool,
+    ):
+        self.contig_taxonomies = contig_taxonomies
+        self.refhash = refhash
+        self.is_canonical = is_canonical
+
+    @staticmethod
+    def parse_tax_file(
+        path: Path, force_canonical: bool
+    ) -> list[tuple[str, ContigTaxonomy]]:
+        with open(path) as file:
+            result: list[tuple[str, ContigTaxonomy]] = []
+            lines = filter(None, map(str.rstrip, file))
+            header = next(lines, None)
+            if header is None or not header.startswith("contigs\tpredictions"):
+                raise ValueError(
+                    'In taxonomy file, expected header to begin with "contigs\\tpredictions"'
+                )
+            for line in lines:
+                (contigname, taxonomy, *_) = line.split("\t")
+                result.append(
+                    (
+                        contigname,
+                        ContigTaxonomy.from_semicolon_sep(taxonomy, force_canonical),
+                    )
+                )
+
+        return result
+
+
+class PredictedContigTaxonomy:
+    __slots__ = ["tax", "probs"]
+
+    def __init__(self, tax: ContigTaxonomy, probs: np.ndarray):
+        if len(probs) != len(tax.ranks):
+            raise ValueError("The length of probs must equal that of ranks")
+        self.tax = tax
+        self.probs = probs
+
+
+class PredictedTaxonomy:
+    "Output of Taxometer"
+
+    __slots__ = ["contig_taxonomies", "refhash", "is_canonical"]
+
+    def __init__(
+        self,
+        taxonomies: list[PredictedContigTaxonomy],
+        metadata: CompositionMetaData,
+        is_canonical: bool,
+    ):
+        if len(taxonomies) != len(metadata.identifiers):
+            raise ValueError("Length of taxonomies must match that of identifiers")
+
+        self.contig_taxonomies = taxonomies
+        self.refhash = metadata.refhash
+        self.is_canonical = is_canonical
+
+    def to_taxonomy(self) -> Taxonomy:
+        lst: list[Optional[ContigTaxonomy]] = [p.tax for p in self.contig_taxonomies]
+        return Taxonomy(lst, self.refhash, self.is_canonical)
+
+    @property
+    def nseqs(self) -> int:
+        return len(self.contig_taxonomies)
+
+    def write_as_tsv(self, file: IO[str], comp_metadata: CompositionMetaData):
+        if self.refhash != comp_metadata.refhash:
+            raise ValueError(
+                "Refhash of comp_metadata and predicted taxonomy must match"
+            )
+        assert self.nseqs == comp_metadata.nseqs
+        print("contigs\tpredictions\tlengths\tscores", file=file)
+        for i in range(self.nseqs):
+            tax = self.contig_taxonomies[i]
+            ranks_str = ";".join(tax.tax.ranks)
";".join([str(round(i, 5)) for i in tax.probs]) + print( + comp_metadata.identifiers[i], + ranks_str, + comp_metadata.lengths[i], + probs_str, + file=file, + sep="\t", + ) diff --git a/vamb/taxvamb_encode.py b/vamb/taxvamb_encode.py index 79ea5333..1541b751 100644 --- a/vamb/taxvamb_encode.py +++ b/vamb/taxvamb_encode.py @@ -23,7 +23,7 @@ import vamb.semisupervised_encode as _semisupervised_encode import vamb.encode as _encode import vamb.hloss_misc as _hloss -from vamb.vambtools import ContigTaxonomy +from vamb.taxonomy import ContigTaxonomy def make_graph( diff --git a/vamb/vambtools.py b/vamb/vambtools.py index c60d9398..b43c683c 100644 --- a/vamb/vambtools.py +++ b/vamb/vambtools.py @@ -9,7 +9,7 @@ import collections as _collections from itertools import zip_longest from hashlib import md5 as _md5 -from collections.abc import Iterable, Iterator, Sequence +from collections.abc import Iterable, Iterator from typing import Optional, IO, Union from pathlib import Path from loguru import logger @@ -223,66 +223,6 @@ def clear(self) -> None: self.length = 0 -class ContigTaxonomy: - """ - Hierarchical taxonomy of some contig. - If `is_canonical`, the ranks are assumed to be domain, phylum, class, - order, family, genus, species, in that order. - The taxonomy may be arbitrarily truncated, e.g. ["Eukaryota", "Chordata"] - is a valid (canonical) taxonomy for a human. - """ - - __slots__ = ["is_canonical", "ranks"] - - def __init__(self, ranks: list[str], is_canonical: bool = False): - if is_canonical and len(ranks) > 7: - raise ValueError( - "For a canonical ContigTaxonomy, there must be at most 7 ranks" - ) - - self.ranks = ranks - self.is_canonical = is_canonical - - @classmethod - def from_semicolon_sep(cls, s: str, is_canonical: bool = False): - return cls(s.split(";"), is_canonical) - - @property - def genus(self) -> Optional[str]: - if not self.is_canonical: - raise ValueError("Taxonomy must be canonical to get genus") - if len(self.ranks) < 6: - return None - return self.ranks[5] - - -def parse_taxonomy( - input_file: Path, contignames: Sequence[str], is_canonical: bool -) -> list[Optional[ContigTaxonomy]]: - result: list[Optional[ContigTaxonomy]] = [None] * len(contignames) - index_of_contigname = {c: i for (i, c) in enumerate(contignames)} - with open(input_file) as file: - lines = filter(None, map(str.strip, file)) - header = next(lines) - assert header.startswith("contigs\tpredictions") - for line in lines: - (contigname, taxonomy, *_) = line.split("\t") - index = index_of_contigname.get(contigname) - if index is None: - raise ValueError( - f'When parsing taxonomy, found contigname "{contigname}", ' - "but no sequence of that name is in the FASTA file" - ) - existing = result[index] - if existing is not None: - raise ValueError( - f'Duplicate contigname when parsing taxonomy: "{contigname}"' - ) - result[index] = ContigTaxonomy.from_semicolon_sep(taxonomy, is_canonical) - - return result - - def zscore( array: _np.ndarray, axis: Optional[int] = None, inplace: bool = False ) -> _np.ndarray: