From 2c61993f5f8a9f42ec25b68fc39ba6a98f0c9ed0 Mon Sep 17 00:00:00 2001 From: Johannes Griss Date: Fri, 3 Jul 2020 14:55:52 +0200 Subject: [PATCH 01/13] Added central storage for clustering parameters All clustering associated parameters are now collected in the single ClusteringParameters class. --- .../org/spectra/cluster/tools/CliOptions.java | 17 +- .../LocalParallelBinnedClusteringTool.java | 37 +-- .../cluster/util/ClusteringParameters.java | 241 ++++++++++++++++++ .../cluster/util/DefaultParameters.java | 103 -------- src/main/resources/application.properties | 4 +- 5 files changed, 266 insertions(+), 136 deletions(-) create mode 100644 src/main/java/org/spectra/cluster/util/ClusteringParameters.java delete mode 100644 src/main/java/org/spectra/cluster/util/DefaultParameters.java diff --git a/src/main/java/org/spectra/cluster/tools/CliOptions.java b/src/main/java/org/spectra/cluster/tools/CliOptions.java index 535b365..93c71d5 100644 --- a/src/main/java/org/spectra/cluster/tools/CliOptions.java +++ b/src/main/java/org/spectra/cluster/tools/CliOptions.java @@ -20,7 +20,7 @@ public enum OPTIONS { PRECURSOR_TOLERANCE("precursor.tolerance", "p"), FRAGMENT_PRECISION("fragment.precision", "f"), - MAJOR_PEAK_JOBS("major.peak.jobs", "m"), + N_THREADS("n.threads", "t"), START_THRESHOLD("threshold.start", "s"), END_THRESHOLD("threshold.end", "e"), @@ -47,7 +47,8 @@ public enum OPTIONS { ADVANCED_NUMBER_PREFILTERED_PEAKS("xn.prefiltered.peaks", "pp"), ADVANCED_LEARN_CDF("x.learn.cdf", "lcdf"), ADVANCED_LOAD_CDF_FILE("x.load.cdf", "lcdf"), - ADVANCED_DISABLE_MGF_COMMENTS("disable.mgf.comments", "dmc"); + ADVANCED_DISABLE_MGF_COMMENTS("disable.mgf.comments", "dmc"), + ADVANCED_MIN_INITIAL_PEAKS("x.min.initial.shared.peaks", "mip"); private String value; private String longValue; @@ -126,9 +127,9 @@ public String getLongValue() { Option majorPeakJobs = OptionBuilder .hasArg() - .withDescription("Number of threads to use for major peak clustering.") - 
.withLongOpt(OPTIONS.MAJOR_PEAK_JOBS.getLongValue()) - .create(OPTIONS.MAJOR_PEAK_JOBS.getValue()); + .withDescription("Number of threads to use during the clustering process.") + .withLongOpt(OPTIONS.N_THREADS.getLongValue()) + .create(OPTIONS.N_THREADS.getValue()); options.addOption(majorPeakJobs); Option binaryDirectory = OptionBuilder @@ -224,6 +225,12 @@ public String getLongValue() { .withLongOpt(OPTIONS.ADVANCED_DISABLE_MGF_COMMENTS.getLongValue()) .create(OPTIONS.ADVANCED_DISABLE_MGF_COMMENTS.getValue()); options.addOption(xDisableMgfComments); + + Option xMinInitialSharedPeaks = OptionBuilder + .withDescription("(Advanced option) The number two spectra must share to be compared using the probabilistic comparison method.") + .withLongOpt(OPTIONS.ADVANCED_MIN_INITIAL_PEAKS.getLongValue()) + .create(OPTIONS.ADVANCED_MIN_INITIAL_PEAKS.getValue()); + options.addOption(xMinInitialSharedPeaks); } public static Options getOptions() { diff --git a/src/main/java/org/spectra/cluster/tools/LocalParallelBinnedClusteringTool.java b/src/main/java/org/spectra/cluster/tools/LocalParallelBinnedClusteringTool.java index ab1b823..5ec8552 100644 --- a/src/main/java/org/spectra/cluster/tools/LocalParallelBinnedClusteringTool.java +++ b/src/main/java/org/spectra/cluster/tools/LocalParallelBinnedClusteringTool.java @@ -1,14 +1,12 @@ package org.spectra.cluster.tools; -import lombok.Data; -import lombok.extern.slf4j.Slf4j; import io.github.bigbio.pgatk.io.common.PgatkIOException; import io.github.bigbio.pgatk.io.mapcache.IMapStorage; import io.github.bigbio.pgatk.io.objectdb.LongObject; import io.github.bigbio.pgatk.io.objectdb.ObjectsDB; +import lombok.Data; +import lombok.extern.slf4j.Slf4j; import org.spectra.cluster.binning.IClusterBinner; -import org.spectra.cluster.cdf.INumberOfComparisonAssessor; -import org.spectra.cluster.engine.GreedyClusteringEngine; import org.spectra.cluster.engine.IClusteringEngine; import org.spectra.cluster.exceptions.SpectraClusterException; 
import org.spectra.cluster.io.cluster.ClusterStorageFactory; @@ -16,8 +14,7 @@ import org.spectra.cluster.model.cluster.GreedySpectralCluster; import org.spectra.cluster.model.cluster.ICluster; import org.spectra.cluster.model.cluster.IClusterProperties; -import org.spectra.cluster.predicates.IComparisonPredicate; -import org.spectra.cluster.similarity.IBinarySpectrumSimilarity; +import org.spectra.cluster.util.ClusteringParameters; import java.io.File; import java.util.Arrays; @@ -37,12 +34,8 @@ public class LocalParallelBinnedClusteringTool { private final IClusterBinner binner; private final Class clusterClass; - public void runClustering(IClusterProperties[] clusters, IMapStorage clusterStorage, File resultFile, - int precursorTolerance, float thresholdStart, float thresholdEnd, - int clusteringRounds, IBinarySpectrumSimilarity similarityMeasure, - INumberOfComparisonAssessor numberOfComparisonAssessor, - IComparisonPredicate firstRoundPredicate, - int consensusSpectrumNoiseFilterIncrement) throws SpectraClusterException { + public void runClustering(IClusterProperties[] clusters, IMapStorage clusterStorage, + ClusteringParameters clusteringParameters) throws SpectraClusterException { log.debug("------ Local parallel binned clustering -----"); try { @@ -64,8 +57,7 @@ public void runClustering(IClusterProperties[] clusters, IMapStorage c // cluster the initially binned clusters IClusterProperties[][] firstRoundResult = clusterMapped(binnedClusterIds, clusterStorage, firstRoundStorage, - precursorTolerance, thresholdStart, thresholdEnd, clusteringRounds, similarityMeasure, - numberOfComparisonAssessor, firstRoundPredicate, consensusSpectrumNoiseFilterIncrement); + clusteringParameters); // close the initial storage clusterStorage.close(); @@ -87,15 +79,14 @@ public void runClustering(IClusterProperties[] clusters, IMapStorage c // run the clustering again on the re-binned ids IClusterProperties[][] secondRoundResult = clusterMapped(rebinnedClusterIds, 
firstRoundStorage, secondRoundStorage, - precursorTolerance, thresholdStart, thresholdEnd, clusteringRounds, similarityMeasure, - numberOfComparisonAssessor, firstRoundPredicate, consensusSpectrumNoiseFilterIncrement); + clusteringParameters); // close the first round storage - thereby deleting the temporary data firstRoundStorage.close(); // write the final clusters to file ObjectDBGreedyClusterStorage writer = new ObjectDBGreedyClusterStorage( - new ObjectsDB(resultFile.getAbsolutePath(), true)); + new ObjectsDB(clusteringParameters.getOutputFile().getAbsolutePath(), true)); // write all clusters to storage Arrays.stream(secondRoundResult).flatMap(Arrays::stream) @@ -149,20 +140,12 @@ private synchronized void writeClusters(IMapStorage storage, ICluster[ } private IClusterProperties[][] clusterMapped(String[][] binnedClusterIds, IMapStorage clusterStorage, - IMapStorage resultStorage, int precursorTolerance, - float thresholdStart, float thresholdEnd, - int clusteringRounds, IBinarySpectrumSimilarity similarityMeasure, - INumberOfComparisonAssessor numberOfComparisonAssessor, - IComparisonPredicate firstRoundPredicate, - int consensusSpectrumNoiseFilterIncrement) throws Exception { + IMapStorage resultStorage, ClusteringParameters clusteringParameters) throws Exception { // start the clustering ForkJoinPool clusteringPool = new ForkJoinPool(parallelJobs, ForkJoinPool.defaultForkJoinWorkerThreadFactory, null, true); return clusteringPool.submit(() -> Arrays.stream(binnedClusterIds).parallel().map((String[] clusterIds) -> { try { - // TODO: replace with factory for clustering engine - IClusteringEngine engine = new GreedyClusteringEngine(precursorTolerance, thresholdStart, thresholdEnd, - clusteringRounds, similarityMeasure, numberOfComparisonAssessor, firstRoundPredicate, - consensusSpectrumNoiseFilterIncrement); + IClusteringEngine engine = clusteringParameters.createGreedyClusteringEngine(); // load the clusters - parallel reads are not a problem ICluster[] 
loadedClusters = new ICluster[clusterIds.length]; diff --git a/src/main/java/org/spectra/cluster/util/ClusteringParameters.java b/src/main/java/org/spectra/cluster/util/ClusteringParameters.java new file mode 100644 index 0000000..f3acaa5 --- /dev/null +++ b/src/main/java/org/spectra/cluster/util/ClusteringParameters.java @@ -0,0 +1,241 @@ +package org.spectra.cluster.util; + +import lombok.Data; +import org.apache.commons.cli.CommandLine; +import org.spectra.cluster.cdf.SpectraPerBinNumberComparisonAssessor; +import org.spectra.cluster.engine.GreedyClusteringEngine; +import org.spectra.cluster.filter.rawpeaks.*; +import org.spectra.cluster.model.cluster.ICluster; +import org.spectra.cluster.normalizer.BasicIntegerNormalizer; +import org.spectra.cluster.normalizer.HighResolutionMzBinner; +import org.spectra.cluster.normalizer.IMzBinner; +import org.spectra.cluster.normalizer.TideBinner; +import org.spectra.cluster.predicates.IComparisonPredicate; +import org.spectra.cluster.predicates.SameChargePredicate; +import org.spectra.cluster.predicates.ShareNComparisonPeaksPredicate; +import org.spectra.cluster.similarity.CombinedFisherIntensityTest; +import org.spectra.cluster.tools.CliOptions; + +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.net.URISyntaxException; +import java.util.Properties; + +/** + * This code is licensed under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * ==Overview== + * + * @author ypriverol on 18/10/2018. + */ +@Data +public class ClusteringParameters { + + private String binaryDirectory; + private boolean reuseBinary; + private boolean fastMode; + private Integer clusterRounds; + private boolean filterReportPeaks; + private Integer numberHigherPeaks; + private Double precursorIonTolerance; + private String fragmentIonPrecision; + private boolean ignoreCharge; + + private Float thresholdStart; + private Float thresholdEnd; + private int nInitiallySharedPeaks; + private int minNumberOfComparisons; + + private File outputFile; + + private int nThreads; + + + public ClusteringParameters(){ + + try { + Properties properties = readProperties(); + setProperties(properties); + + } catch (URISyntaxException e) { + e.printStackTrace(); + } + + } + + private void setProperties(Properties properties) { + if(properties.containsKey("precursor.tolerance")) + this.precursorIonTolerance = Double.parseDouble(properties.getProperty("precursor.tolerance").trim()); + if(properties.containsKey("fragment.precision")) + this.fragmentIonPrecision = properties.getProperty("fragment.precision"); + if(properties.containsKey("n.threads")) + this.nThreads = Integer.parseInt(properties.getProperty("n.threads")); + if(properties.containsKey("threshold.start")) + this.thresholdStart = Float.parseFloat(properties.getProperty("threshold.start")); + if(properties.containsKey("threshold.end")) + this.thresholdEnd = Float.parseFloat(properties.getProperty("threshold.end")); + if(properties.containsKey("number.higher.peaks")) + this.numberHigherPeaks = Integer.parseInt(properties.getProperty("number.higher.peaks")); + if(properties.containsKey("cluster.rounds")) + this.clusterRounds = Integer.parseInt(properties.getProperty("cluster.rounds")); + if(properties.containsKey("binary.temp.directory")) + this.binaryDirectory = properties.getProperty("binary.temp.directory"); + if(properties.containsKey("reuse.binary.files")) + this.reuseBinary =
Boolean.parseBoolean(properties.getProperty("reuse.binary.files")); + if(properties.containsKey("ignore.charge")) + this.ignoreCharge = Boolean.parseBoolean(properties.getProperty("ignore.charge")); + if(properties.containsKey("cluster.fast.mode")) + this.fastMode = Boolean.parseBoolean(properties.getProperty("cluster.fast.mode")); + if(properties.containsKey("filters.remove.reporter.peaks")) + this.filterReportPeaks = Boolean.parseBoolean(properties.getProperty("filters.remove.reporter.peaks")); + if(properties.containsKey("initially.shared.peaks")) + this.nInitiallySharedPeaks = Integer.parseInt(properties.getProperty("initially.shared.peaks")); + if(properties.containsKey("x.min.comparisons")) + this.minNumberOfComparisons = Integer.parseInt(properties.getProperty("x.min.comparisons")); + } + + public Properties readProperties() throws URISyntaxException { + Properties properties = new Properties(); + InputStream output; + + try { + output = getClass().getClassLoader().getResourceAsStream("application.properties"); + properties.load(output); + } catch (IOException e) { + e.printStackTrace(); + } + return properties; + } + + /** + * Adapt the default parameters based on the set command line arguments. + * + * @param commandLine A command line object. 
+ */ + public void mergeCommandLineArgs(CommandLine commandLine) { + if (commandLine.hasOption(CliOptions.OPTIONS.OUTPUT_PATH.getValue())) + outputFile = new File(commandLine.getOptionValue(CliOptions.OPTIONS.OUTPUT_PATH.getValue())); + + // NUMBER OF ROUNDS + if (commandLine.hasOption(CliOptions.OPTIONS.ROUNDS.getValue())) + clusterRounds = Integer.parseInt(commandLine.getOptionValue(CliOptions.OPTIONS.ROUNDS.getValue())); + + if (commandLine.hasOption(CliOptions.OPTIONS.START_THRESHOLD.getValue())) + thresholdStart = Float.parseFloat(commandLine.getOptionValue(CliOptions.OPTIONS.START_THRESHOLD.getValue())); + + if (commandLine.hasOption(CliOptions.OPTIONS.END_THRESHOLD.getValue())) + thresholdEnd = Float.parseFloat(commandLine.getOptionValue(CliOptions.OPTIONS.END_THRESHOLD.getValue())); + + if (commandLine.hasOption(CliOptions.OPTIONS.PRECURSOR_TOLERANCE.getValue())) + precursorIonTolerance = Double.parseDouble(commandLine.getOptionValue(CliOptions.OPTIONS.PRECURSOR_TOLERANCE.getValue())); + + if (commandLine.hasOption(CliOptions.OPTIONS.FRAGMENT_PRECISION.getValue())) + fragmentIonPrecision = commandLine.getOptionValue(CliOptions.OPTIONS.FRAGMENT_PRECISION.getValue()); + + ignoreCharge = commandLine.hasOption(CliOptions.OPTIONS.IGNORE_CHARGE.getValue()); + + if (commandLine.hasOption(CliOptions.OPTIONS.TEMP_DIRECTORY.getValue())) + binaryDirectory = commandLine.getOptionValue(CliOptions.OPTIONS.TEMP_DIRECTORY.getValue()); + + if (commandLine.hasOption(CliOptions.OPTIONS.ADVANCED_MIN_NUMBER_COMPARISONS.getValue())) + minNumberOfComparisons = Integer.parseInt(commandLine.getOptionValue(CliOptions.OPTIONS.ADVANCED_MIN_NUMBER_COMPARISONS.getValue())); + + if (commandLine.hasOption(CliOptions.OPTIONS.ADVANCED_MIN_INITIAL_PEAKS.getValue())) + nInitiallySharedPeaks = Integer.parseInt(commandLine.getOptionValue(CliOptions.OPTIONS.ADVANCED_MIN_INITIAL_PEAKS.getValue())); + + if (commandLine.hasOption(CliOptions.OPTIONS.N_THREADS.getValue())) + nThreads = 
Integer.parseInt(commandLine.getOptionValue(CliOptions.OPTIONS.N_THREADS.getValue())); + } + + public void mergeParameters(String configFile) throws IOException { + File propertiesFactoryBean = new File(configFile); + Properties newProperties = new Properties(); + InputStream output = new FileInputStream(propertiesFactoryBean); + newProperties.load(output); + setProperties(newProperties); + } + + // ---------- + // A collection of functions to return clustering objects matching the + // parameters + + /** + * Get the precursor tolerance as an integer. + * + * @return The precursor tolerance + */ + public int getIntPrecursorTolerance() { + return (int) Math.round(precursorIonTolerance * (double) BasicIntegerNormalizer.MZ_CONSTANT); + } + + /** + * Creates a new GreedyClusteringEngine based on the currently set parameters. + * + * Note: The validity of these parameters is not checked in this function but + * must be checked before calling it. + * + * @return A new instance of a GreedyClusteringEngine + * @throws Exception + */ + public GreedyClusteringEngine createGreedyClusteringEngine() throws Exception { + int precursorTolerance = getIntPrecursorTolerance(); + + SpectraPerBinNumberComparisonAssessor numberOfComparisonAssessor = new SpectraPerBinNumberComparisonAssessor( + precursorTolerance * 2, minNumberOfComparisons, BasicIntegerNormalizer.MZ_CONSTANT * 2500 + ); + + IComparisonPredicate firstRoundPredicate = new ShareNComparisonPeaksPredicate( + nInitiallySharedPeaks); + + if (!ignoreCharge) { + firstRoundPredicate = new SameChargePredicate().and(firstRoundPredicate); + } + + int windowSizeNoiseFilter = (fragmentIonPrecision.equalsIgnoreCase("high")) ? 
3000 : 100; + + GreedyClusteringEngine engine = new GreedyClusteringEngine( + precursorTolerance, + thresholdStart, thresholdEnd, clusterRounds, new CombinedFisherIntensityTest(), + numberOfComparisonAssessor, firstRoundPredicate, + windowSizeNoiseFilter); + + return engine; + } + + /** + * Create a new loading filter based on the currently set parameters. + * + * Note: The validity of these parameters is not checked in this function but + * must be checked before calling it. + * + * @return A new instance of an IRawSpectrumFunction + */ + public IRawSpectrumFunction createLoadingFilter() { + // set an approximate fragment tolerance for the filters + double fragmentTolerance = (fragmentIonPrecision.equalsIgnoreCase("high")) ? 0.01 : 0.5; + + return new RemoveImpossiblyHighPeaksFunction() + .specAndThen(new RemovePrecursorPeaksFunction(fragmentTolerance)) + .specAndThen(new RawPeaksWrapperFunction(new KeepNHighestRawPeaks(numberHigherPeaks))); + + } + + /** + * Creates a new instance of the matching m/z binner. + * + * Note: The validity of these parameters is not checked in this function but + * must be checked before calling it. + * + * @return A new IMzBinner object. + */ + public IMzBinner createMzBinner() { + return (fragmentIonPrecision.equalsIgnoreCase("high")) ? 
+ new HighResolutionMzBinner() : new TideBinner(); + } +} diff --git a/src/main/java/org/spectra/cluster/util/DefaultParameters.java b/src/main/java/org/spectra/cluster/util/DefaultParameters.java deleted file mode 100644 index 2adc120..0000000 --- a/src/main/java/org/spectra/cluster/util/DefaultParameters.java +++ /dev/null @@ -1,103 +0,0 @@ -package org.spectra.cluster.util; - -import lombok.Data; - -import java.io.File; -import java.io.FileInputStream; -import java.io.IOException; -import java.io.InputStream; -import java.net.URISyntaxException; -import java.util.Properties; - -/** - * This code is licensed under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - *

- * http://www.apache.org/licenses/LICENSE-2.0 - *

- * ==Overview== - * - * @author ypriverol on 18/10/2018. - */ -@Data -public class DefaultParameters { - - private String binaryDirectory; - private boolean reuseBinary; - private boolean fastMode; - private Integer clusterRounds; - private boolean filterReportPeaks; - private Integer numberHigherPeaks; - private Double precursorIonTolerance; - private String fragmentIonPrecision; - private boolean ignoreCharge; - - private Float thresholdStart; - private Float thresholdEnd; - private int nInitiallySharedPeaks; - private int minNumberOfComparisons; - - - public DefaultParameters(){ - - try { - Properties properties = readProperties(); - setProperties(properties); - - } catch (URISyntaxException e) { - e.printStackTrace(); - } - - } - - private void setProperties(Properties properties) { - if(properties.containsKey("precursor.tolerance")) - this.precursorIonTolerance = Double.parseDouble(properties.getProperty("precursor.tolerance").trim()); - if(properties.containsKey("fragment.precision")) - this.fragmentIonPrecision = properties.getProperty("fragment.precision"); - if(properties.containsKey("threshold.start")) - this.thresholdStart = Float.parseFloat(properties.getProperty("threshold.start")); - if(properties.containsKey("threshold.end")) - this.thresholdEnd = Float.parseFloat(properties.getProperty("threshold.end")); - if(properties.containsKey("number.higher.peaks")) - this.numberHigherPeaks = Integer.parseInt(properties.getProperty("number.higher.peaks")); - if(properties.containsKey("cluster.rounds")) - this.clusterRounds = Integer.parseInt(properties.getProperty("cluster.rounds")); - if(properties.containsKey("binary.temp.directory")) - this.binaryDirectory = properties.getProperty("binary.temp.directory"); - if(properties.containsKey("reuse.binary.files")) - this.reuseBinary = Boolean.parseBoolean(properties.getProperty("reuse.binary.files")); - if(properties.containsKey("ignore.charge")) - this.ignoreCharge = 
Boolean.parseBoolean(properties.getProperty("ignore.charge")); - if(properties.containsKey("cluster.fast.mode")) - this.fastMode = Boolean.parseBoolean(properties.getProperty("cluster.fast.mode")); - if(properties.containsKey("filters.remove.reporter.peaks")) - this.filterReportPeaks = Boolean.parseBoolean(properties.getProperty("filters.remove.reporter.peaks")); - if(properties.containsKey("initially.shared.peaks")) - this.nInitiallySharedPeaks = Integer.parseInt(properties.getProperty("initially.shared.peaks")); - if(properties.containsKey("x.min.comparisons")) - this.minNumberOfComparisons = Integer.parseInt(properties.getProperty("x.min.comparisons")); - } - - public Properties readProperties() throws URISyntaxException { - Properties properties = new Properties(); - InputStream output; - - try { - output = getClass().getClassLoader().getResourceAsStream("application.properties"); - properties.load(output); - } catch (IOException e) { - e.printStackTrace(); - } - return properties; - } - - public void mergeParameters(String configFile) throws IOException { - File propertiesFactoryBean = new File(configFile); - Properties newProperties = new Properties(); - InputStream output = new FileInputStream(propertiesFactoryBean); - newProperties.load(output); - setProperties(newProperties); - } -} diff --git a/src/main/resources/application.properties b/src/main/resources/application.properties index cb28b27..9e86c47 100644 --- a/src/main/resources/application.properties +++ b/src/main/resources/application.properties @@ -1,7 +1,6 @@ precursor.tolerance=1 fragment.precision=low - # Clustering execution parameters threshold.start=1.0F threshold.end=0.99F @@ -19,6 +18,9 @@ number.higher.peaks=40 initially.shared.peaks=5 filters.remove.reporter.peaks=true +# number of parallel jobs +n.threads = 2 + # The minimum number of comparisons is derived from the data. 
The set number # is used as an additional minimum x.min.comparisons=0 \ No newline at end of file From 4ef3f41ec46dd0ca374ab81e61ff3bf4e8162b9b Mon Sep 17 00:00:00 2001 From: Johannes Griss Date: Wed, 29 Jul 2020 14:37:51 +0200 Subject: [PATCH 02/13] Completed migration to central parameter storage --- pom.xml | 4 + .../cluster/tools/SpectraClusterTool.java | 359 +++++++++--------- .../cluster/util/ClusteringParameters.java | 2 +- ...LocalParallelBinnedClusteringToolTest.java | 20 +- 4 files changed, 198 insertions(+), 187 deletions(-) diff --git a/pom.xml b/pom.xml index 33fc55e..220800c 100644 --- a/pom.xml +++ b/pom.xml @@ -220,6 +220,10 @@ sonatype-snapshopt https://oss.sonatype.org/content/repositories/snapshots + + nuiton + http://maven.nuiton.org/release/ + diff --git a/src/main/java/org/spectra/cluster/tools/SpectraClusterTool.java b/src/main/java/org/spectra/cluster/tools/SpectraClusterTool.java index 66e1d44..59f4bd6 100644 --- a/src/main/java/org/spectra/cluster/tools/SpectraClusterTool.java +++ b/src/main/java/org/spectra/cluster/tools/SpectraClusterTool.java @@ -1,37 +1,37 @@ package org.spectra.cluster.tools; +import io.github.bigbio.pgatk.io.mapcache.IMapStorage; +import io.github.bigbio.pgatk.io.properties.IPropertyStorage; +import io.github.bigbio.pgatk.io.properties.PropertyStorageFactory; import lombok.extern.slf4j.Slf4j; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.CommandLineParser; import org.apache.commons.cli.HelpFormatter; import org.apache.commons.cli.PosixParser; -import io.github.bigbio.pgatk.io.objectdb.ObjectsDB; -import io.github.bigbio.pgatk.io.properties.IPropertyStorage; -import io.github.bigbio.pgatk.io.properties.PropertyStorageFactory; -import org.spectra.cluster.cdf.SpectraPerBinNumberComparisonAssessor; +import org.spectra.cluster.binning.IClusterBinner; +import org.spectra.cluster.binning.SimilarSizedClusterBinner; import org.spectra.cluster.engine.GreedyClusteringEngine; -import 
org.spectra.cluster.engine.IClusteringEngine; import org.spectra.cluster.exceptions.MissingParameterException; import org.spectra.cluster.filter.binaryspectrum.HighestPeakPerBinFunction; -import org.spectra.cluster.filter.rawpeaks.*; -import org.spectra.cluster.io.cluster.ObjectDBGreedyClusterStorage; +import org.spectra.cluster.io.cluster.ClusterStorageFactory; import org.spectra.cluster.io.spectra.MzSpectraReader; +import org.spectra.cluster.model.cluster.GreedySpectralCluster; import org.spectra.cluster.model.cluster.ICluster; -import org.spectra.cluster.normalizer.*; -import org.spectra.cluster.predicates.IComparisonPredicate; -import org.spectra.cluster.predicates.SameChargePredicate; -import org.spectra.cluster.predicates.ShareNComparisonPeaksPredicate; -import org.spectra.cluster.similarity.CombinedFisherIntensityTest; +import org.spectra.cluster.model.cluster.IClusterProperties; +import org.spectra.cluster.normalizer.BasicIntegerNormalizer; +import org.spectra.cluster.normalizer.MaxPeakNormalizer; import org.spectra.cluster.tools.utils.IProgressListener; import org.spectra.cluster.tools.utils.ProgressUpdate; -import org.spectra.cluster.util.DefaultParameters; +import org.spectra.cluster.util.ClusteringParameters; import java.io.File; -import java.nio.file.Path; -import java.nio.file.Paths; +import java.io.IOException; import java.time.Duration; import java.time.LocalDateTime; -import java.util.*; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Iterator; +import java.util.List; /** * This code is licensed under the Apache License, Version 2.0 (the @@ -47,12 +47,6 @@ @Slf4j public class SpectraClusterTool implements IProgressListener { - public static final boolean DELETE_TEMPORARY_CLUSTERING_RESULTS = true; - - public DefaultParameters defaultParameters = new DefaultParameters(); - - private boolean verbose; - public static void main(String[] args) { SpectraClusterTool instance = new SpectraClusterTool(); instance.run(args); @@ -62,6 
+56,7 @@ private void run(String[] args) { CommandLineParser parser = new PosixParser(); try { + // parse the command line CommandLine commandLine = parser.parse(CliOptions.getOptions(), args); // HELP @@ -70,196 +65,200 @@ private void run(String[] args) { return; } - // File input - String[] peakFiles = commandLine.getArgs(); - - if (peakFiles.length < 1) { - printUsage(); - throw new MissingParameterException("Missing input files"); - } - - // RESULT FILE PATH - if (!commandLine.hasOption(CliOptions.OPTIONS.OUTPUT_PATH.getValue())) - throw new MissingParameterException("Missing required option " + - CliOptions.OPTIONS.OUTPUT_PATH.getValue()); - - File finalResultFile = new File(commandLine.getOptionValue(CliOptions.OPTIONS.OUTPUT_PATH.getValue())); - - if(commandLine.hasOption(CliOptions.OPTIONS.CONFIG_FILE.getValue())) - defaultParameters.mergeParameters(commandLine.getOptionValue(CliOptions.OPTIONS.CONFIG_FILE.getValue())); - - if (finalResultFile.exists()) - throw new Exception("Result file " + finalResultFile + " already exists"); - - // NUMBER OF ROUNDS - int rounds = defaultParameters.getClusterRounds(); - if (commandLine.hasOption(CliOptions.OPTIONS.ROUNDS.getValue())) - rounds = Integer.parseInt(commandLine.getOptionValue(CliOptions.OPTIONS.ROUNDS.getValue())); - - // START THRESHOLD - float startThreshold = defaultParameters.getThresholdStart(); - if (commandLine.hasOption(CliOptions.OPTIONS.START_THRESHOLD.getValue())) - startThreshold = Float.parseFloat(commandLine.getOptionValue(CliOptions.OPTIONS.START_THRESHOLD.getValue())); - - // END THRESHOLD - float endThreshold = defaultParameters.getThresholdEnd(); - if (commandLine.hasOption(CliOptions.OPTIONS.END_THRESHOLD.getValue())) - endThreshold = Float.parseFloat(commandLine.getOptionValue(CliOptions.OPTIONS.END_THRESHOLD.getValue())); - - // PRECURSOR TOLERANCE - // TODO: Add support for precursor tolerance - double precursorTolerance = defaultParameters.getPrecursorIonTolerance(); - if 
(commandLine.hasOption(CliOptions.OPTIONS.PRECURSOR_TOLERANCE.getValue())) { - precursorTolerance = Float.parseFloat(commandLine.getOptionValue(CliOptions.OPTIONS.PRECURSOR_TOLERANCE.getValue())); - } - // convert the precursor tolerance to int space - int binnedPrecursorTolerance = (int) Math.round(precursorTolerance * (double) BasicIntegerNormalizer.MZ_CONSTANT); - - // FRAGMENT TOLERANCE - // TODO: Add support for fragment tolerance - String fragmentPrecision = defaultParameters.getFragmentIonPrecision(); - if (commandLine.hasOption(CliOptions.OPTIONS.FRAGMENT_PRECISION.getValue())) { - fragmentPrecision = commandLine.getOptionValue(CliOptions.OPTIONS.FRAGMENT_PRECISION.getValue()); - } - if (!"high".equalsIgnoreCase(fragmentPrecision) && !"low".equalsIgnoreCase(fragmentPrecision)) { - throw new Exception("Invalid fragment precision set. Allowed values are 'low' and 'high'"); - } - - // IGNORE CHARGE STATE - boolean ignoreCharge = defaultParameters.isIgnoreCharge(); - if (commandLine.hasOption(CliOptions.OPTIONS.IGNORE_CHARGE.getValue())) { - ignoreCharge = true; - } + // create the object to hold the parameters + ClusteringParameters clusteringParameters = new ClusteringParameters(); - String tempFolder = defaultParameters.getBinaryDirectory(); - if(commandLine.hasOption(CliOptions.OPTIONS.TEMP_DIRECTORY.getValue())){ - tempFolder = commandLine.getOptionValue(CliOptions.OPTIONS.TEMP_DIRECTORY.getValue()); - }else{ - tempFolder = createTempFolderPath(finalResultFile, tempFolder); - } + // if a config file is set, merge the config file parameters first + if (commandLine.hasOption(CliOptions.OPTIONS.CONFIG_FILE.getValue())) { + String configFilePath = commandLine.getOptionValue(CliOptions.OPTIONS.CONFIG_FILE.getValue()); - /** Advanced options */ - // MIN NUMBER OF COMPARISONS - int minNumberComparisons = defaultParameters.getMinNumberOfComparisons(); - if (commandLine.hasOption(CliOptions.OPTIONS.ADVANCED_MIN_NUMBER_COMPARISONS.getValue())) { - 
minNumberComparisons = Integer.parseInt(commandLine.getOptionValue(CliOptions.OPTIONS.ADVANCED_MIN_NUMBER_COMPARISONS.getValue())); + clusteringParameters.mergeParameters(configFilePath); + log.info("Configuration loaded from " + configFilePath); } - SpectraPerBinNumberComparisonAssessor numberOfComparisonAssessor = new SpectraPerBinNumberComparisonAssessor( - binnedPrecursorTolerance * 2, minNumberComparisons, BasicIntegerNormalizer.MZ_CONSTANT * 2500 - ); - - int nInitiallySharedPeaks = defaultParameters.getNInitiallySharedPeaks(); + // merge all other command line parameters overwriting the config file values + clusteringParameters.mergeCommandLineArgs(commandLine); - /** Perform clustering **/ - LocalDateTime startTime = LocalDateTime.now(); + // make sure that the set parameters are valid + checkParameterValidity(clusteringParameters); - // set an approximate fragment tolerance for the filters - double fragmentTolerance = (fragmentPrecision.equalsIgnoreCase("high")) ? 0.01 : 0.5; - - // set the window size for the noise filter - int windowSizeNoiseFilter = (fragmentPrecision.equalsIgnoreCase("high")) ? 3000 : 100; + // all remaining parameters are treated as input files + String[] peakFiles = commandLine.getArgs(); - // set the appropriate m/z binner - IMzBinner mzBinner = (fragmentPrecision.equalsIgnoreCase("high")) ? 
- new HighResolutionMzBinner() : new TideBinner(); + if (peakFiles.length < 1) { + printUsage(); + throw new MissingParameterException("Missing input files"); + } - IRawSpectrumFunction loadingFilter = new RemoveImpossiblyHighPeaksFunction() - .specAndThen(new RemovePrecursorPeaksFunction(fragmentTolerance)) - .specAndThen(new RawPeaksWrapperFunction(new KeepNHighestRawPeaks(defaultParameters - .getNumberHigherPeaks()))); + // create the temporary directory in case it wasn't set + if (clusteringParameters.getBinaryDirectory() == null) + clusteringParameters.setBinaryDirectory(createTempFolderPath(clusteringParameters.getOutputFile(), "binary-clustering-files")); - IPropertyStorage localStorage = PropertyStorageFactory - .buildDynamicLevelDBPropertyStorage(new File(tempFolder)); + // start the clustering + runClustering(peakFiles, clusteringParameters); - File[] inputFiles = Arrays.stream(peakFiles) - .map(File::new) - .toArray(File[]::new); + // exit nicely + System.exit(0); + } catch (MissingParameterException e) { + System.out.println("Error: " + e.getMessage() + "\n\n"); + printUsage(); - // create the comparison predicate for the first round - IComparisonPredicate firstRoundPredicate = new ShareNComparisonPeaksPredicate( - nInitiallySharedPeaks); + System.exit(1); + } catch (Exception e) { + e.printStackTrace(); + System.out.println("Error: " + e.getMessage()); - if (!ignoreCharge) { - firstRoundPredicate = new SameChargePredicate().and(firstRoundPredicate); - } + System.exit(1); + } + } - IClusteringEngine engine = new GreedyClusteringEngine( - binnedPrecursorTolerance, - startThreshold, endThreshold, rounds, new CombinedFisherIntensityTest(), - numberOfComparisonAssessor, firstRoundPredicate, - windowSizeNoiseFilter); + /** + * Run the clustering process using the defined parameters. 
+ * + * @param clusteringParameters + * @throws Exception + */ + private void runClustering(String[] peakFiles, ClusteringParameters clusteringParameters) throws Exception { + // create the storage to load the spectra + IPropertyStorage propertyStorage = PropertyStorageFactory + .buildDynamicLevelDBPropertyStorage(new File(clusteringParameters.getBinaryDirectory())); - MzSpectraReader reader = new MzSpectraReader( mzBinner, new MaxPeakNormalizer(), - new BasicIntegerNormalizer(), new HighestPeakPerBinFunction(), loadingFilter, - GreedyClusteringEngine.COMPARISON_FILTER, engine, inputFiles); + File clusterStorageDir = createUniqueDirectory(new File(clusteringParameters.getBinaryDirectory(), "loaded-clusters")); + IMapStorage clusterStorage = ClusterStorageFactory.buildTemporaryDynamicStorage(clusterStorageDir, GreedySpectralCluster.class); - // set the comparison assessor as listener to count the spectra per bin - reader.addSpectrumListener(numberOfComparisonAssessor); + // load the spectra + IClusterProperties[] loadedClusters = loadInputFiles(peakFiles, clusteringParameters, propertyStorage, clusterStorage); - Iterator iterator = reader.readClusterIterator(localStorage); + // run the clustering in parallel + File clusteringTmpDir = createUniqueDirectory(new File(clusteringParameters.getBinaryDirectory(), "clustering-files")); - /** - * Todo: We need to review the current implementation because it loads in memory - * all the clusters. 
If we want to the Ehcache we will need to use Maps and - all the methods - **/ - List spectra = new ArrayList<>(1_000); + IClusterBinner clusterBinner = new SimilarSizedClusterBinner( + 2 * clusteringParameters.getIntPrecursorTolerance(), + 1_000, !clusteringParameters.isIgnoreCharge()); - log.debug(String.format("Loading spectra from %d file(s)...", inputFiles.length)); + LocalParallelBinnedClusteringTool clusteringTool = new LocalParallelBinnedClusteringTool( + clusteringParameters.getNThreads(), clusteringTmpDir, clusterBinner, GreedySpectralCluster.class); - int count = 0; - while (iterator.hasNext()) { - ICluster cluster = iterator.next(); - spectra.add(cluster); - count++; - } + LocalDateTime startTime = LocalDateTime.now(); + log.debug("Starting clustering..."); - LocalDateTime loadingCompleteTime = LocalDateTime.now(); - log.debug(String.format("Loaded %d spectra in %d seconds", count, - Duration.between(startTime, loadingCompleteTime).getSeconds())); + clusteringTool.runClustering(loadedClusters, clusterStorage, clusteringParameters); - // sort according to m/z - spectra.sort(Comparator.comparingInt(ICluster::getPrecursorMz)); + // some nice output + LocalDateTime clusteringCompleteTime = LocalDateTime.now(); + log.debug(String.format("Clustering completed in %d seconds", + Duration.between(startTime, clusteringCompleteTime).getSeconds())); - Path thisResult = Paths.get(finalResultFile.getAbsolutePath()); + log.info("Result file written to " + clusteringParameters.getOutputFile()); - log.debug("Clustering files..."); - ICluster[] clusters = engine.clusterSpectra(spectra.toArray(new ICluster[spectra.size()])); + // TODO: Create .clustering output file - LocalDateTime clusteringCompleteTime = LocalDateTime.now(); - log.debug(String.format("Clustering completed in %d seconds", Duration.between(loadingCompleteTime, clusteringCompleteTime).getSeconds())); + // close the storage + clusterStorage.close(); + propertyStorage.close(); + } - 
ObjectDBGreedyClusterStorage writer = new ObjectDBGreedyClusterStorage( - new ObjectsDB(finalResultFile.getAbsolutePath(), true)); - writer.addGreedySpectralClusters(clusters); - writer.writeDBMode(); - writer.flush(); + /** + * Loads all spectra from the defined peak list files as IClusters. The clusters are stored in the defined clusterStorage. + * The cluster's properties are stored in the propertyStorage. The cluster's basic properties are returned as an array. + * + * @param peakFiles Files to load. + * @param clusteringParameters Clustering parameters to use. + * @param propertyStorage The property storage. + * @param clusterStorage Storage to store the clusters in. + * @return The IClusterProperties of the loaded clusters. + */ + private IClusterProperties[] loadInputFiles(String[] peakFiles, ClusteringParameters clusteringParameters, + IPropertyStorage propertyStorage, IMapStorage clusterStorage) + throws Exception { + log.debug(String.format("Loading spectra from %d input files", peakFiles.length)); + LocalDateTime startTime = LocalDateTime.now(); + + // convert the input files to file objects + File[] inputFiles = Arrays.stream(peakFiles) + .map(File::new) + .toArray(File[]::new); + + // load the spectra using an MzSpectraReader + MzSpectraReader reader = new MzSpectraReader( clusteringParameters.createMzBinner(), new MaxPeakNormalizer(), + new BasicIntegerNormalizer(), new HighestPeakPerBinFunction(), clusteringParameters.createLoadingFilter(), + GreedyClusteringEngine.COMPARISON_FILTER, clusteringParameters.createGreedyClusteringEngine(), inputFiles); + + // create the iterator to load the clusters + Iterator iterator = reader.readClusterIterator(propertyStorage); + + List loadedClusters = new ArrayList<>(1000); + + while (iterator.hasNext()) { + ICluster cluster = iterator.next(); + + // all clusters are primarily stored in the cluster storage + clusterStorage.put(cluster.getId(), cluster); + + // only retain the basic properties + 
loadedClusters.add(cluster.getProperties()); + } - log.debug(String.format("Results written in %d seconds", - Duration.between(clusteringCompleteTime, LocalDateTime.now()).getSeconds())); - System.out.println("Results written to " + thisResult.toString()); + // some nice output + LocalDateTime loadingCompleteTime = LocalDateTime.now(); + log.debug(String.format("Loaded %d spectra in %d seconds", loadedClusters.size(), + Duration.between(startTime, loadingCompleteTime).getSeconds())); - log.debug(String.format("Process completed in %d seconds", Duration.between(startTime, - LocalDateTime.now()).getSeconds())); + return loadedClusters.toArray(new IClusterProperties[0]); + } - // Close the property storage and delete folders and property files. - localStorage.close(); + /** + * Create a new unique directory name. In case the target name already exists, a new directory with the name + * {target name}-{N} is created. + * + * @param targetDirectoryName The target directory name / path. + * @return The File object representing the finally created unique directory. + * @throws IOException Thrown if creating the directory failed. 
+ */ + private File createUniqueDirectory(File targetDirectoryName) throws IOException { + int iteration = 1; + String orgPath = targetDirectoryName.getAbsolutePath(); + + while (targetDirectoryName.exists()) { + targetDirectoryName = new File(orgPath + "-" + String.valueOf(iteration)); + iteration++; + } - System.exit(0); - } catch (MissingParameterException e) { - System.out.println("Error: " + e.getMessage() + "\n\n"); - printUsage(); + // now that the directory does not yet exist, create it + if (!targetDirectoryName.mkdir()) { + throw new IOException("Failed to create directory " + targetDirectoryName.getAbsolutePath()); + } - System.exit(1); - } catch (Exception e) { - e.printStackTrace(); - System.out.println("Error: " + e.getMessage()); + return targetDirectoryName; + } - System.exit(1); + /** + * Ensures that the set user parameters are valid. In case they are not, an Exception is thrown. + * + * @param clusteringParameters The set parameters + * @throws Exception In case something is invalid. + * @throws MissingParameterException + */ + private void checkParameterValidity(ClusteringParameters clusteringParameters) throws Exception, MissingParameterException { + // RESULT FILE PATH + if (clusteringParameters.getOutputFile() == null) + throw new MissingParameterException("Missing required option " + + CliOptions.OPTIONS.OUTPUT_PATH.getValue()); + + // ensure that the output file does not exist + if (clusteringParameters.getOutputFile().exists()) + throw new Exception("Result file " + clusteringParameters.getOutputFile().getAbsolutePath() + " already exists"); + + // check whether the fragment tolerance is valid + if (!"high".equalsIgnoreCase(clusteringParameters.getFragmentIonPrecision()) && + !"low".equalsIgnoreCase(clusteringParameters.getFragmentIonPrecision())) { + throw new Exception("Invalid fragment precision set. 
Allowed values are 'low' and 'high'"); } } + /** + + TODO: update print function private void printSettings(File finalResultFile, int nMajorPeakJobs, float startThreshold, float endThreshold, int rounds, boolean keepBinaryFiles, File binaryTmpDirectory, String[] peaklistFilenames, boolean reUseBinaryFiles, boolean fastMode, @@ -278,7 +277,7 @@ private void printSettings(File finalResultFile, int nMajorPeakJobs, float start System.out.println("Using fast mode: " + (fastMode ? "yes" : "no")); System.out.println("\nOther settings:"); - System.out.println("Precursor tolerance: " + defaultParameters.getPrecursorIonTolerance()); + System.out.println("Precursor tolerance: " + clusteringParameters.getPrecursorIonTolerance()); System.out.println("Fragment ion precision: " + fragmentPrecision); // used filters @@ -296,7 +295,7 @@ private void printSettings(File finalResultFile, int nMajorPeakJobs, float start // System.out.println("Minimum number of comparisons: " + Defaults.getMinNumberComparisons()); System.out.println(); - } + }*/ private void printUsage() { HelpFormatter formatter = new HelpFormatter(); diff --git a/src/main/java/org/spectra/cluster/util/ClusteringParameters.java b/src/main/java/org/spectra/cluster/util/ClusteringParameters.java index f3acaa5..2916df0 100644 --- a/src/main/java/org/spectra/cluster/util/ClusteringParameters.java +++ b/src/main/java/org/spectra/cluster/util/ClusteringParameters.java @@ -75,7 +75,7 @@ private void setProperties(Properties properties) { if(properties.containsKey("fragment.precision")) this.fragmentIonPrecision = properties.getProperty("fragment.precision"); if(properties.containsKey("n.threads")) - this.nThreads = Integer.parseInt(properties.getProperty("jobs.precision")); + this.nThreads = Integer.parseInt(properties.getProperty("n.threads")); if(properties.containsKey("threshold.start")) this.thresholdStart = Float.parseFloat(properties.getProperty("threshold.start")); if(properties.containsKey("threshold.end")) diff --git 
a/src/test/java/org/spectra/cluster/tools/LocalParallelBinnedClusteringToolTest.java b/src/test/java/org/spectra/cluster/tools/LocalParallelBinnedClusteringToolTest.java index 5a98978..247d00e 100644 --- a/src/test/java/org/spectra/cluster/tools/LocalParallelBinnedClusteringToolTest.java +++ b/src/test/java/org/spectra/cluster/tools/LocalParallelBinnedClusteringToolTest.java @@ -24,6 +24,7 @@ import org.spectra.cluster.normalizer.TideBinner; import org.spectra.cluster.predicates.ShareHighestPeaksClusterPredicate; import org.spectra.cluster.similarity.CombinedFisherIntensityTest; +import org.spectra.cluster.util.ClusteringParameters; import java.io.File; import java.net.URI; @@ -108,11 +109,18 @@ public void testParallelClustering() throws Exception { File finalResultFile = new File(testDir.toFile(), "result.bcs"); - clusterer.runClustering(testClusters, clusterStorage, finalResultFile, - BasicIntegerNormalizer.MZ_CONSTANT, - 1, 0.99f, 5, new CombinedFisherIntensityTest(), - new MinNumberComparisonsAssessor(10000), new ShareHighestPeaksClusterPredicate(5), - GreedyConsensusSpectrum.NOISE_FILTER_INCREMENT); + ClusteringParameters clusteringParameters = new ClusteringParameters(); + clusteringParameters.setThresholdStart(1f); + clusteringParameters.setThresholdEnd(0.99f); + clusteringParameters.setClusterRounds(5); + clusteringParameters.setFragmentIonPrecision("high"); + clusteringParameters.setPrecursorIonTolerance((double) 1); + clusteringParameters.setIgnoreCharge(false); + clusteringParameters.setNInitiallySharedPeaks(5); + clusteringParameters.setNThreads(1); + clusteringParameters.setOutputFile(finalResultFile); + + clusterer.runClustering(testClusters, clusterStorage, clusteringParameters); // make sure the final result file exists Assert.assertTrue(finalResultFile.exists()); @@ -131,7 +139,7 @@ public void testParallelClustering() throws Exception { totalSpectra += cluster.getClusteredSpectraCount(); } - Assert.assertEquals(192, totalClusters); + 
Assert.assertEquals(95, totalClusters); Assert.assertEquals(testClusters.length, totalSpectra); } } From 77a85ed34a8801e702dc6f4efb80968859c595f4 Mon Sep 17 00:00:00 2001 From: Johannes Griss Date: Tue, 11 Aug 2020 21:20:02 +0200 Subject: [PATCH 03/13] Added raw top 50 peaks as new spectrum property. --- pom.xml | 2 +- .../cluster/io/spectra/MzSpectraReader.java | 54 ++++++++++--------- .../cluster/io/MzSpectraReaderTest.java | 2 +- 3 files changed, 31 insertions(+), 27 deletions(-) diff --git a/pom.xml b/pom.xml index 220800c..fa98424 100644 --- a/pom.xml +++ b/pom.xml @@ -15,7 +15,7 @@ 3.2.2 3.5 1.2 - 1.0.0-SNAPSHOT + 1.0.1-SNAPSHOT 1.7.25 1.2.3 29.0-jre diff --git a/src/main/java/org/spectra/cluster/io/spectra/MzSpectraReader.java b/src/main/java/org/spectra/cluster/io/spectra/MzSpectraReader.java index 09ee80b..8b015c6 100644 --- a/src/main/java/org/spectra/cluster/io/spectra/MzSpectraReader.java +++ b/src/main/java/org/spectra/cluster/io/spectra/MzSpectraReader.java @@ -1,6 +1,5 @@ package org.spectra.cluster.io.spectra; -import lombok.extern.slf4j.Slf4j; import io.github.bigbio.pgatk.io.clustering.ClusteringFileReader; import io.github.bigbio.pgatk.io.common.MzIterableReader; import io.github.bigbio.pgatk.io.common.Param; @@ -11,6 +10,7 @@ import io.github.bigbio.pgatk.io.objectdb.ObjectsDB; import io.github.bigbio.pgatk.io.properties.IPropertyStorage; import io.github.bigbio.pgatk.io.properties.StoredProperties; +import lombok.extern.slf4j.Slf4j; import org.spectra.cluster.engine.IClusteringEngine; import org.spectra.cluster.exceptions.SpectraClusterException; import org.spectra.cluster.filter.binaryspectrum.HighestPeakPerBinFunction; @@ -52,6 +52,7 @@ */ @Slf4j public class MzSpectraReader { + private final static IRawPeakFunction top50PeaksFilter = new KeepNHighestRawPeaks(50); /** Pattern for validating mzML format */ private static final Pattern mzMLHeaderPattern = Pattern.compile("^[^<]*(<\\?xml [^>]*>\\s*(\\s*)*)?<(mzML)|(indexedmzML) xmlns=.*", 
Pattern.MULTILINE); @@ -303,9 +304,11 @@ public ClusterIteratorConverter, ICluster> readClusterIterator(IP "Clusters"); return new ClusterIteratorConverter<>(iteratorStream, tupleSpectrum -> { + // ignore clusters if (tupleSpectrum.getValue() instanceof io.github.bigbio.pgatk.io.common.cluster.ICluster) { return storeCluster(propertyStorage, tupleSpectrum); } + // create the single spectrum cluster return clusteringEngine.createSingleSpectrumCluster( peaksPerMzWindowFilter.apply(storeIBinarySpectrum(propertyStorage, tupleSpectrum))); }); @@ -319,30 +322,11 @@ private ICluster storeCluster(IPropertyStorage propertyStorage, ITuple tupleSpec (io.github.bigbio.pgatk.io.common.cluster.ICluster) tupleSpectrum.getValue(); ICluster s = transformIOClusterToCluster(spectrum, clusteringEngine); -// // save spectrum properties -// if (propertyStorage != null) { -// for (Param param: spectrum.getAdditional()) { -// propertyStorage.put(s.getUUI(), param.getName(), param.getValue()); -// -// // TODO: map the title and retention time from existing cvParams -// // current implementation might only work for MGF files. 
-// -// // TODO: put support for PTMs -// } -// // always store the original filename -// propertyStorage.put(s.getUUI(), StoredProperties.ORG_FILENAME, inputFile.getName()); -// -// String spectrumId = spectrum.getObjectId(); -// -// // make spectrum id PSI format compatible -// if (spectrum instanceof Ms2Query) { -// spectrumId = "index=" + spectrumId; -// } -// -// propertyStorage.put(s.getUUI(), StoredProperties.FILE_INDEX, spectrumId); -// propertyStorage.put(s.getUUI(), StoredProperties.PRECURSOR_MZ, String.valueOf(spectrum.getPrecursorMZ())); -// propertyStorage.put(s.getUUI(), StoredProperties.CHARGE, String.valueOf(spectrum.getPrecursorCharge())); -// } +// // save additional properties + if (propertyStorage != null) { + // TODO: store additional cluster properties + log.warn("Loaded cluster properties are currently not stored."); + } return s; @@ -375,6 +359,11 @@ public SpectrumIteratorConverter, IBinarySpectrum> readBinarySpec } /** + * Stores the spectrum's properties in the property storage and returns + * the binary spectrum with defined filters already applied. + * + * Currently, the precursor m/z is stored in integer space, the peaks are + * normalized and the comparison filter is being applied. 
* * @param propertyStorage Property Storage * @param tupleSpectrum Spectrum Tuple @@ -385,6 +374,9 @@ private IBinarySpectrum storeIBinarySpectrum(IPropertyStorage propertyStorage, I File inputFile = (File) tupleSpectrum.getKey(); Spectrum spectrum = (Spectrum) tupleSpectrum.getValue(); + // retain the top 50 peaks for later + Map top50Peaks = top50PeaksFilter.apply(spectrum.getPeakList()); + // apply the initial loading filter if (loadingFilter != null) { spectrum = loadingFilter.apply(spectrum); @@ -419,6 +411,18 @@ private IBinarySpectrum storeIBinarySpectrum(IPropertyStorage propertyStorage, I propertyStorage.put(s.getUUI(), StoredProperties.FILE_INDEX, spectrumId); propertyStorage.put(s.getUUI(), StoredProperties.PRECURSOR_MZ, String.valueOf(spectrum.getPrecursorMZ())); propertyStorage.put(s.getUUI(), StoredProperties.CHARGE, String.valueOf(spectrum.getPrecursorCharge())); + + // save the original peaklist as ","-delimited strings (trailing "," is dropped by String.split) + StringBuilder mzValues = new StringBuilder(50); + StringBuilder intensValues = new StringBuilder(50); + + for (Double mz : top50Peaks.keySet()) { + mzValues.append(String.format("%.4f,", mz)); + intensValues.append(String.format("%.4f,", top50Peaks.get(mz))); + } + + propertyStorage.put(s.getUUI(), StoredProperties.ORIGINAL_PEAKS_MZ, mzValues.toString()); + propertyStorage.put(s.getUUI(), StoredProperties.ORIGINAL_PEAKS_INTENS, intensValues.toString()); } // call the listeners diff --git a/src/test/java/org/spectra/cluster/io/MzSpectraReaderTest.java b/src/test/java/org/spectra/cluster/io/MzSpectraReaderTest.java index f51253a..e58c4fa 100644 --- a/src/test/java/org/spectra/cluster/io/MzSpectraReaderTest.java +++ b/src/test/java/org/spectra/cluster/io/MzSpectraReaderTest.java @@ -117,7 +117,7 @@ public void testPropertyLoading() throws Exception { Assert.assertEquals(136, nIdentified); - Assert.assertEquals(7, storage.getAvailableProperties().size()); + Assert.assertEquals(9, storage.getAvailableProperties().size()); } @Test From 
93a601ac4ad3f857f3e724504f0aad7eb76790ea Mon Sep 17 00:00:00 2001 From: Johannes Griss Date: Tue, 11 Aug 2020 21:54:54 +0200 Subject: [PATCH 04/13] Added ConsensusPeak class. Simple class to keep track of peaks in double space. Supports averaging of neighbouring peaks. --- .../cluster/consensus/ConsensusPeak.java | 55 +++++++++++++++++++ .../cluster/consensus/TestConsensusPeak.java | 26 +++++++++ 2 files changed, 81 insertions(+) create mode 100644 src/main/java/org/spectra/cluster/consensus/ConsensusPeak.java create mode 100644 src/test/java/org/spectra/cluster/consensus/TestConsensusPeak.java diff --git a/src/main/java/org/spectra/cluster/consensus/ConsensusPeak.java b/src/main/java/org/spectra/cluster/consensus/ConsensusPeak.java new file mode 100644 index 0000000..9ebda55 --- /dev/null +++ b/src/main/java/org/spectra/cluster/consensus/ConsensusPeak.java @@ -0,0 +1,55 @@ +package org.spectra.cluster.consensus; + +import lombok.Data; + +/** + * A peak in a consensus spectrum (in Double space) that + * keeps track of its count and supports merging of peaks. + */ +@Data +public class ConsensusPeak { + private Double mz; + private Double intensity; + private int count; + + /** + * Initializes a new ConsensusPeak + */ + public ConsensusPeak(Double mz, Double intensity) { + this.mz = mz; + this.intensity = intensity; + this.count = 1; + } + + public Double getMz() { + return mz; + } + + public Double getIntensity() { + return intensity; + } + + /** + * Merge a peak. Both, m/z and intensity are + * averaged weighted based on the respective + * counts. + * @param peak The peak to merge. 
+ */ + public void mergePeak(ConsensusPeak peak) { + // calculate the weight first + int totalCount = this.count + peak.count; + double thisWeight = this.count / (double) totalCount; + double peakWeight = peak.count / (double) totalCount; + + // calculate the new m/z and intensities + double newMz = this.mz * thisWeight + peak.mz * peakWeight; + double newIntensity = this.intensity * thisWeight + peak.intensity * peakWeight; + + // update this variables + this.mz = newMz; + this.intensity = newIntensity; + + // update the count + this.count = totalCount; + } +} diff --git a/src/test/java/org/spectra/cluster/consensus/TestConsensusPeak.java b/src/test/java/org/spectra/cluster/consensus/TestConsensusPeak.java new file mode 100644 index 0000000..12b500f --- /dev/null +++ b/src/test/java/org/spectra/cluster/consensus/TestConsensusPeak.java @@ -0,0 +1,26 @@ +package org.spectra.cluster.consensus; + +import org.junit.Assert; +import org.junit.Test; + +public class TestConsensusPeak { + @Test + public void basicConsensusPeakTest() { + ConsensusPeak p1 = new ConsensusPeak(1.0, 1.0); + ConsensusPeak p2 = new ConsensusPeak(2.0, 0.5); + + p1.mergePeak(p2); + + Assert.assertEquals(1.5, p1.getMz(), 0); + Assert.assertEquals(0.75, p1.getIntensity(), 0); + Assert.assertEquals(2, p1.getCount()); + + ConsensusPeak p3 = new ConsensusPeak(3.0, 4.0); + + p1.mergePeak(p3); + + Assert.assertEquals(2.0, p1.getMz(), 0); + Assert.assertEquals(1.833, p1.getIntensity(), 0.001); + Assert.assertEquals(3, p1.getCount()); + } +} From b3ab4af2ec78d1b041009ce59f18fb0168c7aa71 Mon Sep 17 00:00:00 2001 From: Johannes Griss Date: Sun, 16 Aug 2020 21:49:56 +0200 Subject: [PATCH 05/13] Added ConsensusSpectrum interface and helper functions. 
--- .../AbstractConsensusSpectrumBuilder.java | 118 ++++++++++++++++++ .../consensus/IConsensusSpectrumBuilder.java | 24 ++++ .../TestAbstractConsensusBuilder.java | 63 ++++++++++ 3 files changed, 205 insertions(+) create mode 100644 src/main/java/org/spectra/cluster/consensus/AbstractConsensusSpectrumBuilder.java create mode 100644 src/main/java/org/spectra/cluster/consensus/IConsensusSpectrumBuilder.java create mode 100644 src/test/java/org/spectra/cluster/consensus/TestAbstractConsensusBuilder.java diff --git a/src/main/java/org/spectra/cluster/consensus/AbstractConsensusSpectrumBuilder.java b/src/main/java/org/spectra/cluster/consensus/AbstractConsensusSpectrumBuilder.java new file mode 100644 index 0000000..4437165 --- /dev/null +++ b/src/main/java/org/spectra/cluster/consensus/AbstractConsensusSpectrumBuilder.java @@ -0,0 +1,118 @@ +package org.spectra.cluster.consensus; + +import io.github.bigbio.pgatk.io.common.PgatkIOException; +import io.github.bigbio.pgatk.io.common.spectra.Spectrum; +import io.github.bigbio.pgatk.io.properties.IPropertyStorage; +import io.github.bigbio.pgatk.io.properties.StoredProperties; +import org.spectra.cluster.model.cluster.ICluster; +import org.spectra.cluster.normalizer.IIntensityNormalizer; +import org.spectra.cluster.normalizer.MaxPeakNormalizer; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Comparator; +import java.util.List; +import java.util.stream.Collectors; + +/** + * Collection of commonly required functions for all + * consensus spectrum builders. 
+ */ +public abstract class AbstractConsensusSpectrumBuilder implements IConsensusSpectrumBuilder { + private static final IIntensityNormalizer intensityNormalizer = new MaxPeakNormalizer(); + + @Override + public abstract Spectrum createConsensusSpectrum(ICluster cluster, IPropertyStorage spectrumPropertyStorage); + + // TODO: Load original scores based on https://github.com/bigbio/pia/blob/master/src/main/java/de/mpc/pia/modeller/score/ScoreModelEnum.java + + /** + * Merge consensus peaks that are within the specified m/z tolerance. + * + * Similar peaks are merged using the ConsensusPeak::mergePeak function where + * the weighted average m/z and weighted average intensity is used. + * + * @param peaks The peaks to process. + * @param mzTolerance The tolerance within which to merge peaks. + * @return A list of merged peaks. + */ + static protected List mergeConsensusPeaks(List peaks, double mzTolerance) { + // sort the peaks according to m/z + peaks.sort(Comparator.comparingDouble(ConsensusPeak::getMz)); + + // iterate over the peaks + List sortedPeaks = new ArrayList<>(peaks.size()); + ConsensusPeak lastPeak = null; + + for (ConsensusPeak currentPeak : peaks) { + // if within the tolerance, merge into the last peak + if (lastPeak != null && currentPeak.getMz() - lastPeak.getMz() <= mzTolerance) { + lastPeak.mergePeak(currentPeak); + } else { + // add as new peak + sortedPeaks.add(currentPeak); + lastPeak = currentPeak; + } + } + + return sortedPeaks; + } + + /** + * Loads all original spectra from the property storage and returns them as one + * crowded peak list. + * + * @param cluster The cluster to load the spectra for. + * @param propertyStorage The spectra's property storage. + * @param normalizeIntensity If set, the intensity is automatically normalized based on the highest peak intensity. + * @return A list of ConsensusPeakS + * @throws PgatkIOException If the loading of properties failed. 
+ */ + static protected List loadOriginalPeaks(ICluster cluster, IPropertyStorage propertyStorage, + boolean normalizeIntensity) throws PgatkIOException { + // load a max of 50 peaks per spectrum + List peaks = new ArrayList<>(cluster.getClusteredSpectraCount() * 50); + + for (String spectrumId : cluster.getClusteredSpectraIds()) { + // load the m/z and intensity values + String mzString = propertyStorage.get(spectrumId, StoredProperties.ORIGINAL_PEAKS_MZ); + String intensityString = propertyStorage.get(spectrumId, StoredProperties.ORIGINAL_PEAKS_INTENS); + + double[] mz = convertDoubleString(mzString); + double[] intens = convertDoubleString(intensityString); + + // sanity check + if (mz.length != intens.length) { + throw new PgatkIOException("Number of m/z and intensity values differs for spectrum " + spectrumId); + } + + // normalize the intensity + if (normalizeIntensity) { + // convert to double list + List intensList = Arrays.stream(intens).boxed().collect(Collectors.toList()); + + // normalize + intens = Arrays.stream(intensityNormalizer.binDoubles(intensList)) + .mapToDouble(Double::new).toArray(); + } + + for (int i = 0; i < mz.length; i++) { + peaks.add(new ConsensusPeak(mz[i], intens[i])); + } + } + + return peaks; + } + + /** + * Convert a string of "," delimited double values into an + * array of doubles. + * @param valueString The string to convert. + * @return An array of doubles. 
+ */ + static private double[] convertDoubleString(String valueString) { + return Arrays.stream(valueString.split(",")) + .mapToDouble(Double::valueOf) + .toArray(); + } +} diff --git a/src/main/java/org/spectra/cluster/consensus/IConsensusSpectrumBuilder.java b/src/main/java/org/spectra/cluster/consensus/IConsensusSpectrumBuilder.java new file mode 100644 index 0000000..662aa9d --- /dev/null +++ b/src/main/java/org/spectra/cluster/consensus/IConsensusSpectrumBuilder.java @@ -0,0 +1,24 @@ +package org.spectra.cluster.consensus; + +import io.github.bigbio.pgatk.io.common.spectra.Spectrum; +import io.github.bigbio.pgatk.io.properties.IPropertyStorage; +import org.spectra.cluster.model.cluster.ICluster; + +/** + * Describes classes that create consensus spectra + * based on a list of clusters. + * + * @author jg + */ +public interface IConsensusSpectrumBuilder { + /** + * Creates a consensus spectrum for the defined cluster. Some consensus + * spectrum building approaches require additional information that is + * retrieved from the IPropertyStorage. + * + * @param cluster The cluster to build the consensus spectrum for. + * @param spectrumPropertyStorage The property storage holding the spectra's additional properties. + * @return The consensus spectrum. 
+ */ + public Spectrum createConsensusSpectrum(ICluster cluster, IPropertyStorage spectrumPropertyStorage); +} diff --git a/src/test/java/org/spectra/cluster/consensus/TestAbstractConsensusBuilder.java b/src/test/java/org/spectra/cluster/consensus/TestAbstractConsensusBuilder.java new file mode 100644 index 0000000..0ac5dae --- /dev/null +++ b/src/test/java/org/spectra/cluster/consensus/TestAbstractConsensusBuilder.java @@ -0,0 +1,63 @@ +package org.spectra.cluster.consensus; + +import io.github.bigbio.pgatk.io.properties.IPropertyStorage; +import io.github.bigbio.pgatk.io.properties.InMemoryPropertyStorage; +import org.junit.Assert; +import org.junit.Test; +import org.spectra.cluster.cdf.MinNumberComparisonsAssessor; +import org.spectra.cluster.engine.GreedyClusteringEngine; +import org.spectra.cluster.engine.IClusteringEngine; +import org.spectra.cluster.io.spectra.MzSpectraReader; +import org.spectra.cluster.model.cluster.ICluster; +import org.spectra.cluster.model.consensus.GreedyConsensusSpectrum; +import org.spectra.cluster.normalizer.BasicIntegerNormalizer; +import org.spectra.cluster.normalizer.MaxPeakNormalizer; +import org.spectra.cluster.predicates.ShareHighestPeaksClusterPredicate; +import org.spectra.cluster.similarity.CombinedFisherIntensityTest; + +import java.io.File; +import java.net.URI; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; +import java.util.Objects; + +public class TestAbstractConsensusBuilder { + @Test + public void testLoadOriginalPeaks() throws Exception { + // open the file + URI uri = Objects.requireNonNull(getClass().getClassLoader().getResource("single-spectra.mgf")).toURI(); + File mgfFile = new File(uri); + + IClusteringEngine engine = new GreedyClusteringEngine(BasicIntegerNormalizer.MZ_CONSTANT, + 1, 0.99f, 5, new CombinedFisherIntensityTest(), + new MinNumberComparisonsAssessor(10000), new ShareHighestPeaksClusterPredicate(5), + GreedyConsensusSpectrum.NOISE_FILTER_INCREMENT); + + 
MzSpectraReader spectraReader = new MzSpectraReader(mgfFile, GreedyClusteringEngine.COMPARISON_FILTER, engine); + + IPropertyStorage propertyStorage = new InMemoryPropertyStorage(); + + // read the spectra + Iterator clusterIterator = spectraReader.readClusterIterator(propertyStorage); + List clusters = new ArrayList<>(20); + + while (clusterIterator.hasNext()) { + clusters.add(clusterIterator.next()); + } + + Assert.assertEquals(2, clusters.size()); + + // get the original peaks + for (ICluster cluster : clusters) { + List rawPeaks = AbstractConsensusSpectrumBuilder.loadOriginalPeaks(cluster, propertyStorage, true); + + Assert.assertEquals(50, rawPeaks.size()); + + // max peak must always be the same + double maxValue = rawPeaks.stream().mapToDouble(ConsensusPeak::getIntensity).max().getAsDouble(); + + Assert.assertEquals(MaxPeakNormalizer.MAX_INTENSITY, maxValue, 0); + } + } +} From f37fe15ea4c6dc281ec13364504fce8e18ce2cee Mon Sep 17 00:00:00 2001 From: Johannes Griss Date: Sun, 16 Aug 2020 22:04:06 +0200 Subject: [PATCH 06/13] Added test for mergeConsensusPeaks --- .../AbstractConsensusSpectrumBuilder.java | 5 ++- .../TestAbstractConsensusBuilder.java | 35 +++++++++++++++---- 2 files changed, 32 insertions(+), 8 deletions(-) diff --git a/src/main/java/org/spectra/cluster/consensus/AbstractConsensusSpectrumBuilder.java b/src/main/java/org/spectra/cluster/consensus/AbstractConsensusSpectrumBuilder.java index 4437165..743c1d5 100644 --- a/src/main/java/org/spectra/cluster/consensus/AbstractConsensusSpectrumBuilder.java +++ b/src/main/java/org/spectra/cluster/consensus/AbstractConsensusSpectrumBuilder.java @@ -30,7 +30,10 @@ public abstract class AbstractConsensusSpectrumBuilder implements IConsensusSpec * Merge consensus peaks that are within the specified m/z tolerance. * * Similar peaks are merged using the ConsensusPeak::mergePeak function where - * the weighted average m/z and weighted average intensity is used. 
+ * the weighted average m/z and weighted average intensity is used. No incremental + * threshold is applied. + * + * Warning: This function alters the originally passed objects. * * @param peaks The peaks to process. * @param mzTolerance The tolerance within which to merge peaks. diff --git a/src/test/java/org/spectra/cluster/consensus/TestAbstractConsensusBuilder.java b/src/test/java/org/spectra/cluster/consensus/TestAbstractConsensusBuilder.java index 0ac5dae..19497f2 100644 --- a/src/test/java/org/spectra/cluster/consensus/TestAbstractConsensusBuilder.java +++ b/src/test/java/org/spectra/cluster/consensus/TestAbstractConsensusBuilder.java @@ -17,14 +17,11 @@ import java.io.File; import java.net.URI; -import java.util.ArrayList; -import java.util.Iterator; -import java.util.List; -import java.util.Objects; +import java.util.*; +import java.util.stream.Collectors; public class TestAbstractConsensusBuilder { - @Test - public void testLoadOriginalPeaks() throws Exception { + private List loadClusters(IPropertyStorage propertyStorage) throws Exception { // open the file URI uri = Objects.requireNonNull(getClass().getClassLoader().getResource("single-spectra.mgf")).toURI(); File mgfFile = new File(uri); @@ -36,7 +33,6 @@ public void testLoadOriginalPeaks() throws Exception { MzSpectraReader spectraReader = new MzSpectraReader(mgfFile, GreedyClusteringEngine.COMPARISON_FILTER, engine); - IPropertyStorage propertyStorage = new InMemoryPropertyStorage(); // read the spectra Iterator clusterIterator = spectraReader.readClusterIterator(propertyStorage); @@ -48,6 +44,14 @@ public void testLoadOriginalPeaks() throws Exception { Assert.assertEquals(2, clusters.size()); + return clusters; + } + + @Test + public void testLoadOriginalPeaks() throws Exception { + IPropertyStorage propertyStorage = new InMemoryPropertyStorage(); + List clusters = loadClusters(propertyStorage); + // get the original peaks for (ICluster cluster : clusters) { List rawPeaks = 
AbstractConsensusSpectrumBuilder.loadOriginalPeaks(cluster, propertyStorage, true); @@ -60,4 +64,21 @@ public void testLoadOriginalPeaks() throws Exception { Assert.assertEquals(MaxPeakNormalizer.MAX_INTENSITY, maxValue, 0); } } + + @Test + public void testMergePeaks() throws Exception { + ConsensusPeak p1 = new ConsensusPeak(10.0, 1.0); + ConsensusPeak p2 = new ConsensusPeak(10.1, 2.0); + ConsensusPeak p3 = new ConsensusPeak(11.0, 1.0); + + List peaks = Arrays.stream(new ConsensusPeak[]{p1, p2, p3}).collect(Collectors.toList()); + + List mergedPeaks = AbstractConsensusSpectrumBuilder.mergeConsensusPeaks(peaks, 0.5); + + Assert.assertEquals(2, mergedPeaks.size()); + Assert.assertEquals(10.05, mergedPeaks.get(0).getMz(), 0); + Assert.assertEquals(2, mergedPeaks.get(0).getCount()); + Assert.assertEquals(1.5, mergedPeaks.get(0).getIntensity(), 0); + Assert.assertEquals(1, mergedPeaks.get(1).getCount()); + } } From 15513eacd55104cf2116692caa8a63e7bde8f463 Mon Sep 17 00:00:00 2001 From: Johannes Griss Date: Thu, 27 Aug 2020 13:00:28 +0200 Subject: [PATCH 07/13] Added ConsensusPeak constructor to initially set the count --- .../org/spectra/cluster/consensus/ConsensusPeak.java | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/main/java/org/spectra/cluster/consensus/ConsensusPeak.java b/src/main/java/org/spectra/cluster/consensus/ConsensusPeak.java index 9ebda55..7995c68 100644 --- a/src/main/java/org/spectra/cluster/consensus/ConsensusPeak.java +++ b/src/main/java/org/spectra/cluster/consensus/ConsensusPeak.java @@ -21,6 +21,15 @@ public ConsensusPeak(Double mz, Double intensity) { this.count = 1; } + /** + * Initializes a new ConsensusPeak setting the count + */ + public ConsensusPeak(Double mz, Double intensity, int count) { + this.mz = mz; + this.intensity = intensity; + this.count = count; + } + public Double getMz() { return mz; } From f896e74543b96106c0ddcf414fc218d615712a03 Mon Sep 17 00:00:00 2001 From: Johannes Griss Date: Thu, 27 Aug 2020 14:08:55 
+0200 Subject: [PATCH 08/13] AbstractConsensusSpectrumBuilder: Added function to retriev average precursor m/z --- .../AbstractConsensusSpectrumBuilder.java | 22 +++++++++++++++++++ .../TestAbstractConsensusBuilder.java | 13 +++++++++++ 2 files changed, 35 insertions(+) diff --git a/src/main/java/org/spectra/cluster/consensus/AbstractConsensusSpectrumBuilder.java b/src/main/java/org/spectra/cluster/consensus/AbstractConsensusSpectrumBuilder.java index 743c1d5..c0d34e2 100644 --- a/src/main/java/org/spectra/cluster/consensus/AbstractConsensusSpectrumBuilder.java +++ b/src/main/java/org/spectra/cluster/consensus/AbstractConsensusSpectrumBuilder.java @@ -118,4 +118,26 @@ static private double[] convertDoubleString(String valueString) { .mapToDouble(Double::valueOf) .toArray(); } + + /** + * Get the average precursor m/z of all contained spectra + * within the cluster based on the original spectra's precursor m/z. + * + * @param cluster The cluster + * @param propertyStorage Property storage of the spectra's properties + * @return The precursor m/z as double + * @throws IllegalStateException In case the property storage is not accessible. 
+ */ + static protected Double getAveragePrecursorMz(ICluster cluster, IPropertyStorage propertyStorage) { + return cluster.getClusteredSpectraIds().stream() + .mapToDouble(id -> { + try { + return Double.parseDouble(propertyStorage.get(id, StoredProperties.PRECURSOR_MZ)); + } + catch (Exception e) { + throw new IllegalStateException(e); + } + }) + .average().getAsDouble(); + } } diff --git a/src/test/java/org/spectra/cluster/consensus/TestAbstractConsensusBuilder.java b/src/test/java/org/spectra/cluster/consensus/TestAbstractConsensusBuilder.java index 19497f2..82fe678 100644 --- a/src/test/java/org/spectra/cluster/consensus/TestAbstractConsensusBuilder.java +++ b/src/test/java/org/spectra/cluster/consensus/TestAbstractConsensusBuilder.java @@ -65,6 +65,19 @@ public void testLoadOriginalPeaks() throws Exception { } } + @Test + public void testAveragePrecursorMz () throws Exception { + IPropertyStorage propertyStorage = new InMemoryPropertyStorage(); + List clusters = loadClusters(propertyStorage); + + // get the average precuror m/z for every cluster + for (ICluster cluster : clusters) { + Double averageMz = AbstractConsensusSpectrumBuilder.getAveragePrecursorMz(cluster, propertyStorage); + + Assert.assertEquals(400.29, averageMz, 0.01); + } + } + @Test public void testMergePeaks() throws Exception { ConsensusPeak p1 = new ConsensusPeak(10.0, 1.0); From f9174eff2c62132ae5974aae74b904d3662668f1 Mon Sep 17 00:00:00 2001 From: Johannes Griss Date: Thu, 27 Aug 2020 14:14:58 +0200 Subject: [PATCH 09/13] Added AverageConsensusSpectrumBuilder --- .../AverageConsensusSpectrumBuilder.java | 124 ++++++++++++++++++ .../AverageConsensusSpectrumBuilderTest.java | 70 ++++++++++ .../io/MapClusterStorageBenchmarkTest.java | 14 +- 3 files changed, 197 insertions(+), 11 deletions(-) create mode 100644 src/main/java/org/spectra/cluster/consensus/AverageConsensusSpectrumBuilder.java create mode 100644 
src/test/java/org/spectra/cluster/consensus/AverageConsensusSpectrumBuilderTest.java diff --git a/src/main/java/org/spectra/cluster/consensus/AverageConsensusSpectrumBuilder.java b/src/main/java/org/spectra/cluster/consensus/AverageConsensusSpectrumBuilder.java new file mode 100644 index 0000000..f223a82 --- /dev/null +++ b/src/main/java/org/spectra/cluster/consensus/AverageConsensusSpectrumBuilder.java @@ -0,0 +1,124 @@ +package org.spectra.cluster.consensus; + +import io.github.bigbio.pgatk.io.common.DefaultSpectrum; +import io.github.bigbio.pgatk.io.common.spectra.Spectrum; +import io.github.bigbio.pgatk.io.properties.IPropertyStorage; +import lombok.Data; +import lombok.extern.slf4j.Slf4j; +import org.spectra.cluster.model.cluster.ICluster; +import org.spectra.cluster.util.ClusteringParameters; + +import java.util.*; +import java.util.stream.Collectors; + +/** + * ConsensusSpectrumBuilder that is similar to the consensus + * spectrum builder used by the original spectra-cluster 1.0 + * algorithm. + * + * @author jg + */ +@Slf4j +@Data +public class AverageConsensusSpectrumBuilder extends AbstractConsensusSpectrumBuilder { + private final ClusteringParameters clusteringParameters; + + @Override + public Spectrum createConsensusSpectrum(ICluster cluster, IPropertyStorage spectrumPropertyStorage) { + try { + // load all peaks + List peaks = loadOriginalPeaks(cluster, spectrumPropertyStorage, true); + + // merge the peaks + double fragmentTolerance = (clusteringParameters.getFragmentIonPrecision().equalsIgnoreCase("high") ? 
0.01 : 0.5); + double mzThresholdStep = fragmentTolerance / 5; + + // merge similar peaks with increasing tolerance + for (double currentTolerance = mzThresholdStep; currentTolerance < fragmentTolerance; currentTolerance += mzThresholdStep) { + peaks = mergeConsensusPeaks(peaks, currentTolerance); + } + + // adapt peak intensities based on the probability that they are observed + List adaptedIntensityPeaks = adaptPeakIntensities(peaks, cluster.getClusteredSpectraCount()); + + // apply the noise filter + List filteredPeaks = filterNoise(adaptedIntensityPeaks, 100.0, 5, 20); + + // create the peak list as Map + Map consensusPeaks = new HashMap<>(filteredPeaks.size()); + + for (ConsensusPeak p : filteredPeaks) + consensusPeaks.put(p.getMz(), p.getIntensity()); + + // create the spectrum object + return new DefaultSpectrum( + UUID.randomUUID().toString(), // create a new unique id + 1L, + cluster.getPrecursorCharge(), + getAveragePrecursorMz(cluster, spectrumPropertyStorage), + 1.0, + consensusPeaks, + 2, + Collections.emptyList()); + } catch (Exception e) { + log.error(("Failed to build consensus spectrum: " + e.toString())); + return null; + } + } + + /** + * Adapt the peak intensities in consensusPeaks using the following formula: + * I = I * (0.95 + 0.05 * (1 + pi))^5 + * where pi is the peaks probability + */ + protected static List adaptPeakIntensities(List peaks, int nSpectra) { + List adaptedPeaks = new ArrayList(peaks); + + for (int i = 0; i < adaptedPeaks.size(); i++) { + ConsensusPeak peak = adaptedPeaks.get(i); + double peakProbability = (double) peak.getCount() / (double) nSpectra; + double newIntensity = peak.getIntensity() * (0.95 + 0.05 * Math.pow(1 + peakProbability, 5)); + + adaptedPeaks.set(i, new ConsensusPeak(peak.getMz(), newIntensity, peak.getCount())); + } + + return adaptedPeaks; + } + + /** + * Filters the consensus spectrum keeping only the top N peaks per M m/z + * + * @param peaks The peaks to filter + * @param mzWindowSize The window size 
to use to filter peaks in m/z + * @param maxPeaksPerWindow Number of peaks per m/z window + * @param minTotalPeaks Minimum number of total peaks before filtering is done + */ + protected static List filterNoise(List peaks, double mzWindowSize, int maxPeaksPerWindow, int minTotalPeaks) { + if (peaks.size() <= minTotalPeaks) + return peaks; + + // under certain conditions (averaging m/z values) the order of peaks can be disrupted + peaks.sort(Comparator.comparingDouble(ConsensusPeak::getMz)); + + List filteredPeaks = new ArrayList<>(peaks); + double maxMz = peaks.stream().mapToDouble(ConsensusPeak::getMz).max().getAsDouble(); + + for (double minMz = 0; minMz <= maxMz; minMz += mzWindowSize) { + final double lowerWindowMz = minMz; + + // extract all peaks + List filteredWindowPeaks = peaks.stream() + // keep peaks within the window + .filter(p -> p.getMz() >= lowerWindowMz && p.getMz() < lowerWindowMz + mzWindowSize) + // sort based on intensity + .sorted(Comparator.comparingDouble(ConsensusPeak::getIntensity).reversed()) + // retain max N peaks + .limit(maxPeaksPerWindow) + .collect(Collectors.toList()); + + filteredPeaks.addAll(filteredWindowPeaks); + } + + return filteredPeaks; + } +} diff --git a/src/test/java/org/spectra/cluster/consensus/AverageConsensusSpectrumBuilderTest.java b/src/test/java/org/spectra/cluster/consensus/AverageConsensusSpectrumBuilderTest.java new file mode 100644 index 0000000..6887231 --- /dev/null +++ b/src/test/java/org/spectra/cluster/consensus/AverageConsensusSpectrumBuilderTest.java @@ -0,0 +1,70 @@ +package org.spectra.cluster.consensus; + +import io.github.bigbio.pgatk.io.common.spectra.Spectrum; +import io.github.bigbio.pgatk.io.properties.IPropertyStorage; +import io.github.bigbio.pgatk.io.properties.InMemoryPropertyStorage; +import org.junit.Assert; +import org.junit.Test; +import org.spectra.cluster.cdf.MinNumberComparisonsAssessor; +import org.spectra.cluster.engine.GreedyClusteringEngine; +import 
org.spectra.cluster.engine.IClusteringEngine; +import org.spectra.cluster.filter.binaryspectrum.HighestPeakPerBinFunction; +import org.spectra.cluster.io.spectra.MzSpectraReader; +import org.spectra.cluster.model.cluster.ICluster; +import org.spectra.cluster.model.consensus.GreedyConsensusSpectrum; +import org.spectra.cluster.normalizer.BasicIntegerNormalizer; +import org.spectra.cluster.normalizer.MaxPeakNormalizer; +import org.spectra.cluster.normalizer.TideBinner; +import org.spectra.cluster.predicates.ShareHighestPeaksClusterPredicate; +import org.spectra.cluster.similarity.CombinedFisherIntensityTest; +import org.spectra.cluster.util.ClusteringParameters; + +import java.io.File; +import java.util.ArrayList; +import java.util.Comparator; +import java.util.Iterator; +import java.util.List; + +public class AverageConsensusSpectrumBuilderTest { + @Test + public void testAverageSpectrum() throws Exception { + ClusteringParameters params = new ClusteringParameters(); + IConsensusSpectrumBuilder builder = new AverageConsensusSpectrumBuilder(params); + + // load the spectra + File mgfFile = new File(getClass().getClassLoader().getResource("imp_single_cluster.mgf").toURI()); + IPropertyStorage localStorage = new InMemoryPropertyStorage(); + MzSpectraReader reader = new MzSpectraReader(mgfFile, new TideBinner(), new MaxPeakNormalizer(), + new BasicIntegerNormalizer(), new HighestPeakPerBinFunction(), params.createLoadingFilter(), + GreedyClusteringEngine.COMPARISON_FILTER, params.createGreedyClusteringEngine()); + + Iterator iterator = reader.readClusterIterator(localStorage); + List spectra = new ArrayList<>(1_000); + + while (iterator.hasNext()) { + spectra.add(iterator.next()); + } + + // sort according to m/z + spectra.sort(Comparator.comparingInt(ICluster::getPrecursorMz)); + + // cluster everything + IClusteringEngine engine = new GreedyClusteringEngine(BasicIntegerNormalizer.MZ_CONSTANT, + 1, 0.99f, 5, new CombinedFisherIntensityTest(), + new 
MinNumberComparisonsAssessor(10_000), new ShareHighestPeaksClusterPredicate(5), + GreedyConsensusSpectrum.NOISE_FILTER_INCREMENT); + + ICluster[] clusters = engine.clusterSpectra(spectra.toArray(new ICluster[spectra.size()])); + + Assert.assertEquals(1, clusters.length); + + // create the consensus spectrum + Spectrum consensusSpectrum = builder.createConsensusSpectrum(clusters[0], localStorage); + + Assert.assertNotNull(consensusSpectrum); + Assert.assertEquals(2, (int) consensusSpectrum.getPrecursorCharge()); + Assert.assertEquals(2, (int) consensusSpectrum.getMsLevel()); + Assert.assertEquals(69, consensusSpectrum.getPeakList().size()); + Assert.assertEquals(402.717, consensusSpectrum.getPrecursorMZ(), 0.001); + } +} diff --git a/src/test/java/org/spectra/cluster/io/MapClusterStorageBenchmarkTest.java b/src/test/java/org/spectra/cluster/io/MapClusterStorageBenchmarkTest.java index bf6bb9c..02f14df 100644 --- a/src/test/java/org/spectra/cluster/io/MapClusterStorageBenchmarkTest.java +++ b/src/test/java/org/spectra/cluster/io/MapClusterStorageBenchmarkTest.java @@ -112,12 +112,8 @@ public void storeBigClusterStatic() throws IOException, SpectraClusterException, time = System.currentTimeMillis(); IntStream.range(0, MAX_READING_VALUE).forEach(x -> { - try { - int key = random.nextInt(clusters.length); - ICluster value = clusterStorage.get(clusters[key].getId() + String.valueOf(x)); - }catch (PgatkIOException ex){ - log.error("Error reading entry -- " + x); - } + int key = random.nextInt(clusters.length); + ICluster value = clusterStorage.get(clusters[key].getId() + String.valueOf(x)); }); System.out.println("ChronicleMap: Reading 200'000 Clusters -- " + (System.currentTimeMillis() - time) / 1000); @@ -152,11 +148,7 @@ public void storeBigClusterDynamic() throws IOException, SpectraClusterException time = System.currentTimeMillis(); IntStream.range(0, MAX_READING_VALUE).forEach(x -> { - try { - ICluster value = clusterStorage.get(clusters[0].getId() + "-" + 
String.valueOf(x)); - }catch (PgatkIOException ex){ - log.error("Error reading entry -- " + x); - } + ICluster value = clusterStorage.get(clusters[0].getId() + "-" + String.valueOf(x)); }); System.out.println("Sparkey: Reading 200'000 Clusters -- " + (System.currentTimeMillis() - time) / 1000); From f9f0505a8bb3c86c03d48ffd73a73ccee9487d54 Mon Sep 17 00:00:00 2001 From: Johannes Griss Date: Thu, 27 Aug 2020 14:15:28 +0200 Subject: [PATCH 10/13] Updated MzSpectraReader to store original top50 peaks. --- .../spectra/cluster/io/spectra/MzSpectraReader.java | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/main/java/org/spectra/cluster/io/spectra/MzSpectraReader.java b/src/main/java/org/spectra/cluster/io/spectra/MzSpectraReader.java index 8b015c6..5db649f 100644 --- a/src/main/java/org/spectra/cluster/io/spectra/MzSpectraReader.java +++ b/src/main/java/org/spectra/cluster/io/spectra/MzSpectraReader.java @@ -416,7 +416,17 @@ private IBinarySpectrum storeIBinarySpectrum(IPropertyStorage propertyStorage, I StringBuilder mzValues = new StringBuilder(50); StringBuilder intensValues = new StringBuilder(50); + boolean isFirst = true; + for (Double mz : top50Peaks.keySet()) { + // add the delimiter + if (!isFirst) { + mzValues.append(","); + intensValues.append(","); + } else { + isFirst = false; + } + mzValues.append(String.format("%.4f", mz)); intensValues.append(String.format("%.4f", top50Peaks.get(mz))); } From d9b304301c9119a6aafd8350b51fedb9401fff0a Mon Sep 17 00:00:00 2001 From: Johannes Griss Date: Thu, 27 Aug 2020 14:23:46 +0200 Subject: [PATCH 11/13] Adapted ChronicleMapClusterStorage and SparkKeyClusterStorage for new pgtk-io version. 
--- .../io/cluster/ChronicleMapClusterStorage.java | 6 +++--- .../cluster/io/cluster/SparkKeyClusterStorage.java | 13 ++++++++----- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/src/main/java/org/spectra/cluster/io/cluster/ChronicleMapClusterStorage.java b/src/main/java/org/spectra/cluster/io/cluster/ChronicleMapClusterStorage.java index 9515e69..18fee9e 100644 --- a/src/main/java/org/spectra/cluster/io/cluster/ChronicleMapClusterStorage.java +++ b/src/main/java/org/spectra/cluster/io/cluster/ChronicleMapClusterStorage.java @@ -1,10 +1,10 @@ package org.spectra.cluster.io.cluster; +import io.github.bigbio.pgatk.io.common.PgatkIOException; +import io.github.bigbio.pgatk.io.mapcache.IMapStorage; import lombok.extern.slf4j.Slf4j; import net.openhft.chronicle.map.ChronicleMap; import net.openhft.chronicle.map.ChronicleMapBuilder; -import io.github.bigbio.pgatk.io.common.PgatkIOException; -import io.github.bigbio.pgatk.io.mapcache.IMapStorage; import org.spectra.cluster.model.cluster.ICluster; import java.io.File; @@ -120,7 +120,7 @@ public void put(String key, ICluster cluster) { } @Override - public ICluster get(String key) throws PgatkIOException { + public ICluster get(String key) { return this.clusterStorage.get(key); } diff --git a/src/main/java/org/spectra/cluster/io/cluster/SparkKeyClusterStorage.java b/src/main/java/org/spectra/cluster/io/cluster/SparkKeyClusterStorage.java index 499210f..512624d 100644 --- a/src/main/java/org/spectra/cluster/io/cluster/SparkKeyClusterStorage.java +++ b/src/main/java/org/spectra/cluster/io/cluster/SparkKeyClusterStorage.java @@ -6,6 +6,7 @@ import com.spotify.sparkey.SparkeyWriter; import io.github.bigbio.pgatk.io.common.PgatkIOException; import io.github.bigbio.pgatk.io.mapcache.IMapStorage; +import lombok.extern.slf4j.Slf4j; import org.spectra.cluster.exceptions.SpectraClusterException; import org.spectra.cluster.model.cluster.GreedySpectralCluster; import org.spectra.cluster.model.cluster.ICluster; @@ -18,6 
+19,7 @@ import java.util.HashSet; import java.util.concurrent.atomic.AtomicLong; +@Slf4j public class SparkKeyClusterStorage implements IMapStorage { private final boolean deleteOnClose; @@ -96,17 +98,17 @@ private ICluster deserialize(byte[] bytes) throws PgatkIOException { } @Override - public synchronized void put(String key, ICluster cluster) throws PgatkIOException{ + public synchronized void put(String key, ICluster cluster) { try { writer.put( serialize(key), cluster.toBytes()); }catch (IOException | SpectraClusterException ex){ - throw new PgatkIOException("Error wiring the following property - " + key + " " + cluster.getId() + "error " + ex.getMessage()); + throw new IllegalStateException("Error wiring the following property - " + key + " " + cluster.getId() + "error " + ex.getMessage()); } entryCounter.incrementAndGet(); } @Override - public ICluster get(String key) throws PgatkIOException { + public ICluster get(String key) { try { byte[] byteObject = readers.get().getAsByteArray(serialize(key)); @@ -115,8 +117,9 @@ public ICluster get(String key) throws PgatkIOException { } return deserialize(byteObject); - } catch (IOException ex) { - throw new PgatkIOException("Error retrieving the value for key -- " + key ); + } catch (PgatkIOException | IOException ex) { + log.error("Error retrieving the value for key -- " + key ); + return null; } } From 94c21e16b6c2937baaa25064db2e2407f2b19322 Mon Sep 17 00:00:00 2001 From: Johannes Griss Date: Sat, 29 Aug 2020 21:40:50 +0200 Subject: [PATCH 12/13] Added infrastructure to write clustering results in different formats. First format supported is MSP. 
--- pom.xml | 7 + .../io/result/IClusteringResultWriter.java | 24 ++ .../spectra/cluster/io/result/MspWriter.java | 221 ++++++++++++++++++ .../cluster/io/result/MspWriterTest.java | 127 ++++++++++ 4 files changed, 379 insertions(+) create mode 100644 src/main/java/org/spectra/cluster/io/result/IClusteringResultWriter.java create mode 100644 src/main/java/org/spectra/cluster/io/result/MspWriter.java create mode 100644 src/test/java/org/spectra/cluster/io/result/MspWriterTest.java diff --git a/pom.xml b/pom.xml index fa98424..42c6d2f 100644 --- a/pom.xml +++ b/pom.xml @@ -19,6 +19,7 @@ 1.7.25 1.2.3 29.0-jre + 2.1.12 @@ -99,6 +100,12 @@ ${commons-math3.version} + + uk.ac.ebi.pride.utilities + pride-mod + ${pride-mod.version} + + org.ehcache diff --git a/src/main/java/org/spectra/cluster/io/result/IClusteringResultWriter.java b/src/main/java/org/spectra/cluster/io/result/IClusteringResultWriter.java new file mode 100644 index 0000000..18835ee --- /dev/null +++ b/src/main/java/org/spectra/cluster/io/result/IClusteringResultWriter.java @@ -0,0 +1,24 @@ +package org.spectra.cluster.io.result; + +import io.github.bigbio.pgatk.io.properties.IPropertyStorage; +import org.spectra.cluster.io.cluster.ObjectDBGreedyClusterStorage; + +import java.nio.file.Path; + +/** + * Classes that write the clustering result into + * a specific format. + * + * @author jg + */ +public interface IClusteringResultWriter { + /** + * Convert a clustering result into a specific format. + * + * @param resultFile The final result file that will be created. This file must not exist. + * @param clusterStorage The final clustering result as an ObjectDBGreedyClusterStorage. + * @param spectraPropertyStorage The spectrum property storage. 
+ * @throws Exception + */ + public void writeResult(Path resultFile, ObjectDBGreedyClusterStorage clusterStorage, IPropertyStorage spectraPropertyStorage) throws Exception; +} diff --git a/src/main/java/org/spectra/cluster/io/result/MspWriter.java b/src/main/java/org/spectra/cluster/io/result/MspWriter.java new file mode 100644 index 0000000..af554b3 --- /dev/null +++ b/src/main/java/org/spectra/cluster/io/result/MspWriter.java @@ -0,0 +1,221 @@ +package org.spectra.cluster.io.result; + +import io.github.bigbio.pgatk.io.common.spectra.Spectrum; +import io.github.bigbio.pgatk.io.properties.IPropertyStorage; +import io.github.bigbio.pgatk.io.properties.StoredProperties; +import lombok.Data; +import lombok.extern.slf4j.Slf4j; +import org.spectra.cluster.consensus.IConsensusSpectrumBuilder; +import org.spectra.cluster.io.cluster.ObjectDBGreedyClusterStorage; +import org.spectra.cluster.model.cluster.GreedySpectralCluster; +import uk.ac.ebi.pride.utilities.pridemod.ModReader; +import uk.ac.ebi.pride.utilities.pridemod.model.PTM; +import uk.ac.ebi.pride.utilities.pridemod.model.UniModPTM; + +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.StandardOpenOption; +import java.util.*; +import java.util.function.Function; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import java.util.stream.Collectors; + +/** + * Writes the clustering result into an MSP file. + * + * @author jg + */ +@Slf4j +public class MspWriter implements IClusteringResultWriter { + private final IConsensusSpectrumBuilder consensusSpectrumBuilder; + private static final Pattern MOD_PATTERN = Pattern.compile("[+-][0-9.]+"); + + /** + * A simple helper function to describe a PTM in + * the style used in MSP files. 
+ */ + @Data + protected class MspMod { + private final int position; + private final String name; + private final String aminoAcid; + } + + /** + * Initializes a new MspWriter + * + * @param consensusSpectrumBuilder The consensus spectrum builder to use for the + * actual spectra + */ + public MspWriter(IConsensusSpectrumBuilder consensusSpectrumBuilder) { + this.consensusSpectrumBuilder = consensusSpectrumBuilder; + } + + @Override + public void writeResult(Path resultFile, ObjectDBGreedyClusterStorage clusterStorage, IPropertyStorage spectraPropertyStorage) throws Exception { + // ensure that the file does not exist + if (Files.exists(resultFile)) + throw new Exception(String.format("Error: File '%s' already exists.", resultFile.toString())); + + log.info(String.format("Saving clustering result as MSP file '%s'", resultFile.toString())); + + // iterate over all clusters and convert them to strings + while (clusterStorage.hasNext()) { + GreedySpectralCluster cluster = (GreedySpectralCluster) clusterStorage.next(); + + String clusterMspString = convertCluster(cluster, spectraPropertyStorage); + + Files.write(resultFile, clusterMspString.getBytes(), StandardOpenOption.CREATE, StandardOpenOption.APPEND); + } + } + + private String convertCluster(GreedySpectralCluster cluster, IPropertyStorage propertyStorage) { + StringBuilder clusterString = new StringBuilder(); + + // create the consensus spectrum + Spectrum consensus = consensusSpectrumBuilder.createConsensusSpectrum(cluster, propertyStorage); + + // get the most common PSM + Map sequenceCounts = cluster.getClusteredSpectraIds().stream() + .map(id -> getProperty(propertyStorage, id, StoredProperties.SEQUENCE)) + .filter(Objects::nonNull) + .collect(Collectors.groupingBy(Function.identity(), Collectors.counting())); + + // get the number of identified spectra + long nIdentified = sequenceCounts.entrySet().stream() + .mapToLong(Map.Entry::getValue) + .sum(); + + String maxSequence = ""; + double maxRatio = 0.0; + 
if (sequenceCounts.size() < 1) { + maxSequence = "UNIDENTIFIED"; + } else { + maxSequence = sequenceCounts.entrySet().stream() + .max(Map.Entry.comparingByValue()) + .map(Map.Entry::getKey).orElse(null); + + // get the max ratio + maxRatio = sequenceCounts.get(maxSequence) / (double) nIdentified; + } + + // create the name based on this information + clusterString.append(String.format("Name: %s/%d\n", maxSequence, consensus.getPrecursorCharge())); + // TODO: Handle Naa for unidentified cluster + clusterString.append(String.format("Comment: Spec=Consensus Parent=%.4f Mods=%s Nreps=%d Naa=%d MaxRatio=%.3f\n", + consensus.getPrecursorMZ(), getModString(maxSequence), + cluster.getClusteredSpectraCount(), maxSequence.length(), maxRatio)); + clusterString.append(String.format("Num peaks: %d\n", consensus.getPeakList().size())); + + // add the peaks + consensus.getPeakList().entrySet().stream() + .sorted(Map.Entry.comparingByKey()) + .forEach(peak -> clusterString.append(String.format("%f %f\n", peak.getKey(), peak.getValue()))); + + return clusterString.toString(); + } + + /** + * Creates an MSP Mod: String. + * + * @param sequence The sequence + * @return The mod string + */ + protected String getModString(String sequence) { + List mods = extractModsFromSequence(sequence); + + if (mods.size() < 1) + return "0"; + + StringBuilder modString = new StringBuilder(String.valueOf(mods.size())); + + for (MspMod mod : mods) { + modString.append(String.format("(%d,%s,%s)", mod.getPosition(), mod.getAminoAcid(), mod.getName())); + } + + return modString.toString(); + } + + /** + * Extract the PTMs from the peptide sequence with the PTM deltas. 
+ * + * @param sequence The original sequence + * @return A List of MspModS + */ + protected List extractModsFromSequence(String sequence) { + Matcher matcher = MOD_PATTERN.matcher(sequence); + List mods = new ArrayList<>(3); + int modOffset = 0; + + while (matcher.find()) { + String modString = sequence.substring(matcher.start(), matcher.end()); + String position = ""; + + if (matcher.start() == 0) { + position = "c-term"; + } else if (matcher.end() == sequence.length()) { + position = "n-term"; + } else { + position = sequence.substring(matcher.start() - 1, matcher.start()); + } + + String modName = getModNameForDelta(Double.parseDouble(modString.substring(1)), position); + + if (matcher.start() == 0) { + mods.add(new MspMod(matcher.start() - modOffset, modName, "[")); + } else if (matcher.end() == sequence.length()) { + mods.add(new MspMod(matcher.start() - modOffset, modName, "]")); + } else { + mods.add(new MspMod(matcher.start() - modOffset, modName, sequence.substring(matcher.start() - 1, matcher.start()))); + } + + // keep track of the length of the mod strings + modOffset += modString.length(); + } + // create the pattern to extract the PTMs + return mods; + } + + /** + * Return the modification name for the passed delta. 
+ * + * @param delta The delta mass + * @param position The position of the PTM + * @return The mod string + */ + private String getModNameForDelta(double delta, String position) { + ModReader modReader = ModReader.getInstance(); + List ptms = modReader.getAnchorMassModification(delta, 0.001, ""); + + // only use UNIMOD + ptms = ptms.stream() + .filter(p -> p instanceof UniModPTM) + .sorted(Comparator.comparing(PTM::getAccession)) + .collect(Collectors.toList()); + + if (ptms.size() < 1) { + return String.valueOf(delta); + } else { + // simply return the first + return ptms.get(0).getShortName(); + } + } + + /** + * Retrieves a property from the property storage but converts any + * exception to an IllegalStateExcpetion + * + * @param storage + * @param id + * @param property + * @return + */ + private String getProperty(IPropertyStorage storage, String id, String property) { + try { + return storage.get(id, property); + } catch (Exception e) { + throw new IllegalStateException(e); + } + } +} diff --git a/src/test/java/org/spectra/cluster/io/result/MspWriterTest.java b/src/test/java/org/spectra/cluster/io/result/MspWriterTest.java new file mode 100644 index 0000000..b338439 --- /dev/null +++ b/src/test/java/org/spectra/cluster/io/result/MspWriterTest.java @@ -0,0 +1,127 @@ +package org.spectra.cluster.io.result; + +import io.github.bigbio.pgatk.io.objectdb.LongObject; +import io.github.bigbio.pgatk.io.objectdb.ObjectsDB; +import io.github.bigbio.pgatk.io.properties.IPropertyStorage; +import io.github.bigbio.pgatk.io.properties.InMemoryPropertyStorage; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; +import org.spectra.cluster.cdf.MinNumberComparisonsAssessor; +import org.spectra.cluster.consensus.AverageConsensusSpectrumBuilder; +import org.spectra.cluster.engine.GreedyClusteringEngine; +import org.spectra.cluster.filter.binaryspectrum.HighestPeakPerBinFunction; +import org.spectra.cluster.filter.rawpeaks.*; +import 
org.spectra.cluster.io.cluster.ObjectDBGreedyClusterStorage; +import org.spectra.cluster.io.spectra.MzSpectraReader; +import org.spectra.cluster.model.cluster.GreedySpectralCluster; +import org.spectra.cluster.model.cluster.ICluster; +import org.spectra.cluster.model.cluster.IClusterProperties; +import org.spectra.cluster.model.consensus.GreedyConsensusSpectrum; +import org.spectra.cluster.normalizer.BasicIntegerNormalizer; +import org.spectra.cluster.normalizer.MaxPeakNormalizer; +import org.spectra.cluster.normalizer.TideBinner; +import org.spectra.cluster.predicates.ShareHighestPeaksClusterPredicate; +import org.spectra.cluster.similarity.CombinedFisherIntensityTest; +import org.spectra.cluster.util.ClusteringParameters; + +import java.io.File; +import java.net.URI; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Iterator; +import java.util.List; + +public class MspWriterTest { + Path testDir; + + @Before + public void setUp() throws Exception { + testDir = Files.createTempDirectory("clusters-"); + } + + @Test + public void testMspWriting() throws Exception { + // ignore the property storage for now + IPropertyStorage propertyStorage = new InMemoryPropertyStorage(); + + IRawSpectrumFunction loadingFilter = new RemoveImpossiblyHighPeaksFunction() + .specAndThen(new RemovePrecursorPeaksFunction(0.5)) + .specAndThen(new RawPeaksWrapperFunction(new KeepNHighestRawPeaks(40))); + + // create a basic clustering engine for testing + GreedyClusteringEngine engine = new GreedyClusteringEngine(BasicIntegerNormalizer.MZ_CONSTANT, + 1, 0.99f, 5, new CombinedFisherIntensityTest(), + new MinNumberComparisonsAssessor(10000), new ShareHighestPeaksClusterPredicate(5), + GreedyConsensusSpectrum.NOISE_FILTER_INCREMENT); + + URI[] mgfFiles = new URI[] { + getClass().getClassLoader().getResource("same_sequence_cluster.mgf").toURI(), + 
getClass().getClassLoader().getResource("synthetic_mixed_runs.mgf").toURI()}; + File[] inFiles = Arrays.stream(mgfFiles).map(File::new).toArray(File[]::new); + + // read all files at once + MzSpectraReader reader = new MzSpectraReader(new TideBinner(), new MaxPeakNormalizer(), + new BasicIntegerNormalizer(), new HighestPeakPerBinFunction(), loadingFilter, + GreedyClusteringEngine.COMPARISON_FILTER, engine, inFiles); + + // create the iterator + Iterator iterator = reader.readClusterIterator(propertyStorage); + + // keep track of the cluster ids + List clusterProperties = new ArrayList<>(10_000); + + // create the output file + Path clusteringResult = Paths.get(testDir.toString(), "clustering_result.cls"); + ObjectDBGreedyClusterStorage clusterStorage = new ObjectDBGreedyClusterStorage(new ObjectsDB(clusteringResult.toString(), true)); + + while (iterator.hasNext()) { + GreedySpectralCluster c = (GreedySpectralCluster) iterator.next(); + clusterStorage.addGreedySpectralCluster(LongObject.asLongHash(c.getId()), c); + } + + clusterStorage.writeDBMode(); + clusterStorage.flush(); + + // convert + Path mspFile = Paths.get(testDir.toString(), "clusters.msp"); + + MspWriter resultWriter = new MspWriter(new AverageConsensusSpectrumBuilder(new ClusteringParameters())); + resultWriter.writeResult(mspFile, clusterStorage, propertyStorage); + + // check that everything worked + Assert.assertTrue(Files.exists(mspFile)); + + List lines = Files.readAllLines(mspFile); + Assert.assertEquals("Name: +42.011EVQLVETGGGLIQPGGSLR/2", lines.get(0)); + Assert.assertEquals("Comment: Spec=Consensus Parent=977.0230 Mods=1(0,[,Acetyl) Nreps=1 Naa=26 MaxRatio=1.000", lines.get(1)); + Assert.assertEquals("Num peaks: 50", lines.get(2)); + } + + @Test + public void testExtractPtms() { + MspWriter writer = new MspWriter(new AverageConsensusSpectrumBuilder(new ClusteringParameters())); + String sequence = "+42.011EVQLVET+42.011GGGLIQPGGSLR+42.011"; + + List mods = 
writer.extractModsFromSequence(sequence); + + Assert.assertEquals(3, mods.size()); + + Assert.assertEquals(0, mods.get(0).getPosition()); + Assert.assertEquals("Acetyl", mods.get(0).getName()); + Assert.assertEquals("[", mods.get(0).getAminoAcid()); + } + + @Test + public void testGetModString() { + MspWriter writer = new MspWriter(new AverageConsensusSpectrumBuilder(new ClusteringParameters())); + String sequence = "+42.011EVQLVET+42.011GGGLIQPGGSLR+42.011"; + + String modString = writer.getModString(sequence); + + Assert.assertEquals("3(0,[,Acetyl)(7,T,Acetyl)(19,],Acetyl)", modString); + } +} From 25bda28fffbb5d0c14899efe471a76a6552fe7c9 Mon Sep 17 00:00:00 2001 From: Johannes Griss Date: Sat, 29 Aug 2020 21:49:12 +0200 Subject: [PATCH 13/13] Added command line support for MSP output. --- .../org/spectra/cluster/tools/CliOptions.java | 7 +++++++ .../cluster/tools/SpectraClusterTool.java | 20 +++++++++++++++++++ .../cluster/util/ClusteringParameters.java | 5 +++++ src/main/resources/application.properties | 3 +++ 4 files changed, 35 insertions(+) diff --git a/src/main/java/org/spectra/cluster/tools/CliOptions.java b/src/main/java/org/spectra/cluster/tools/CliOptions.java index 93c71d5..01bc777 100644 --- a/src/main/java/org/spectra/cluster/tools/CliOptions.java +++ b/src/main/java/org/spectra/cluster/tools/CliOptions.java @@ -16,6 +16,7 @@ public enum OPTIONS { CONFIG_FILE("config", "c"), OUTPUT_PATH("output.path", "o"), + OUTPUT_MSP("output.msp", "om"), PRECURSOR_TOLERANCE("precursor.tolerance", "p"), FRAGMENT_PRECISION("fragment.precision", "f"), @@ -104,6 +105,12 @@ public String getLongValue() { .create(OPTIONS.OUTPUT_PATH.getValue()); options.addOption(outputPath); + Option outputMsp = OptionBuilder + .withDescription("If set, an MSP file containing the consensus spectra is written to the same location as the outputfile.") + .withLongOpt(OPTIONS.OUTPUT_MSP.getLongValue()) + .create(OPTIONS.OUTPUT_MSP.getValue()); + options.addOption(outputMsp); + Option 
startThreshold = OptionBuilder .hasArg() .withDescription("(Highest) starting threshold") diff --git a/src/main/java/org/spectra/cluster/tools/SpectraClusterTool.java b/src/main/java/org/spectra/cluster/tools/SpectraClusterTool.java index 59f4bd6..ef13668 100644 --- a/src/main/java/org/spectra/cluster/tools/SpectraClusterTool.java +++ b/src/main/java/org/spectra/cluster/tools/SpectraClusterTool.java @@ -1,6 +1,7 @@ package org.spectra.cluster.tools; import io.github.bigbio.pgatk.io.mapcache.IMapStorage; +import io.github.bigbio.pgatk.io.objectdb.ObjectsDB; import io.github.bigbio.pgatk.io.properties.IPropertyStorage; import io.github.bigbio.pgatk.io.properties.PropertyStorageFactory; import lombok.extern.slf4j.Slf4j; @@ -10,10 +11,14 @@ import org.apache.commons.cli.PosixParser; import org.spectra.cluster.binning.IClusterBinner; import org.spectra.cluster.binning.SimilarSizedClusterBinner; +import org.spectra.cluster.consensus.AverageConsensusSpectrumBuilder; import org.spectra.cluster.engine.GreedyClusteringEngine; import org.spectra.cluster.exceptions.MissingParameterException; import org.spectra.cluster.filter.binaryspectrum.HighestPeakPerBinFunction; import org.spectra.cluster.io.cluster.ClusterStorageFactory; +import org.spectra.cluster.io.cluster.ObjectDBGreedyClusterStorage; +import org.spectra.cluster.io.result.IClusteringResultWriter; +import org.spectra.cluster.io.result.MspWriter; import org.spectra.cluster.io.spectra.MzSpectraReader; import org.spectra.cluster.model.cluster.GreedySpectralCluster; import org.spectra.cluster.model.cluster.ICluster; @@ -26,6 +31,8 @@ import java.io.File; import java.io.IOException; +import java.nio.file.Path; +import java.nio.file.Paths; import java.time.Duration; import java.time.LocalDateTime; import java.util.ArrayList; @@ -151,6 +158,19 @@ private void runClustering(String[] peakFiles, ClusteringParameters clusteringPa log.info("Result file written to " + clusteringParameters.getOutputFile()); + // create the MSP file 
+ if (clusteringParameters.isOutputMsp()) { + Path mspFile = Paths.get(clusteringParameters.getOutputFile().toString() + ".msp"); + IClusteringResultWriter writer = new MspWriter(new AverageConsensusSpectrumBuilder(clusteringParameters)); + + // open the result file again + ObjectDBGreedyClusterStorage resultReader = new ObjectDBGreedyClusterStorage( + new ObjectsDB(clusteringParameters.getOutputFile().getAbsolutePath(), false)); + + log.info("Saving clustering results as MSP file at " + mspFile.toString()); + writer.writeResult(mspFile, resultReader, propertyStorage); + } + + // TODO: Create .clustering output file // close the storage diff --git a/src/main/java/org/spectra/cluster/util/ClusteringParameters.java b/src/main/java/org/spectra/cluster/util/ClusteringParameters.java index 2916df0..5d7ce6a 100644 --- a/src/main/java/org/spectra/cluster/util/ClusteringParameters.java +++ b/src/main/java/org/spectra/cluster/util/ClusteringParameters.java @@ -53,6 +53,7 @@ public class ClusteringParameters { private int minNumberOfComparisons; private File outputFile; + private boolean outputMsp; private int nThreads; @@ -98,6 +99,8 @@ private void setProperties(Properties properties) { this.nInitiallySharedPeaks = Integer.parseInt(properties.getProperty("initially.shared.peaks")); if(properties.containsKey("x.min.comparisons")) this.minNumberOfComparisons = Integer.parseInt(properties.getProperty("x.min.comparisons")); + if(properties.containsKey("output.msp")) + this.outputMsp = Boolean.parseBoolean(properties.getProperty("output.msp")); } public Properties readProperties() throws URISyntaxException { @@ -151,6 +154,8 @@ public void mergeCommandLineArgs(CommandLine commandLine) { if (commandLine.hasOption(CliOptions.OPTIONS.N_THREADS.getValue())) nThreads = Integer.parseInt(commandLine.getOptionValue(CliOptions.OPTIONS.N_THREADS.getValue())); + + outputMsp = commandLine.hasOption(CliOptions.OPTIONS.OUTPUT_MSP.getValue()); } public void mergeParameters(String configFile) 
throws IOException { diff --git a/src/main/resources/application.properties b/src/main/resources/application.properties index 9e86c47..3149508 100644 --- a/src/main/resources/application.properties +++ b/src/main/resources/application.properties @@ -13,6 +13,9 @@ reuse.binary.files=false cluster.fast.mode=false ignore.charge=false +# also write an MSP file containing all consensus spectra +output.msp=true + # Number of peaks that will be used to perform the comparison number.higher.peaks=40 initially.shared.peaks=5