Merge development into main (version update: 1.0.0)

Approved-by: Jose Gavalda Garcia
Bio2Byte · Oct 5, 2023 · 44d4090 · 44d4090
1 parent 6a3a292
commit 44d4090
Show file tree

Hide file tree

Showing 9 changed files with 193 additions and 105 deletions.
diff --git a/MANIFEST.in b/MANIFEST.in
@@ -1 +1,2 @@
-include constava/data/*
+include constava/data/*
+include requirements.txt
diff --git a/README.md b/README.md
diff --git a/constava/__main__.py b/constava/__main__.py
@@ -48,7 +48,7 @@ def parse_parameters(cmdline_arguments):
     genOpt = parser.add_argument_group("Generic options")
     genOpt.add_argument("-h", "--help", action="help", help=tw.dedent(
         """\
-        Show this help message and exit. For detailled 
+        Show this help message and exit. For detailed 
         information on the subcommands, run: 
         `%(prog)s SUBCOMMAND -h`"""))
     genOpt.add_argument("--version", action="version", version=f"%(prog)s {__version__}", 
@@ -82,7 +82,7 @@ def parse_parameters(cmdline_arguments):
         the continuous probability density function of the kde-Model by a fixed set
         of grid-points. The PDF for any sample is then estimated by linear 
         interpolation between the nearest grid points. This is slightly less
-        accurate then the kde-Model but speeds up inference significantly."""),
+        accurate than the kde-Model but speeds up inference significantly."""),
         formatter_class=argparse.RawTextHelpFormatter)
 
     fitIO = parser_fit_model.add_argument_group("Input and output options")
@@ -181,6 +181,12 @@ def parse_parameters(cmdline_arguments):
         """\
         Do inference using <Int> samples obtained through 
         bootstrapping. Multiple values can be provided."""))
+    anaSmpl.add_argument("--bootstrap-series", metavar="<int>", type=int, nargs='+',  help=tw.dedent(
+        """\
+        Do inference using <Int> samples obtained through 
+        bootstrapping. Return the results for every subsample
+        rather than the average. This can result in very 
+        large output files. Multiple values can be provided."""))
     anaSmpl.add_argument("--bootstrap-samples", metavar="<int>", type=int, default=500, help=tw.dedent(
         """\
         When bootstrapping, sample <Int> times from the input data.
@@ -261,6 +267,7 @@ def run_analyze(args):
     params.window = args.window
     params.window_series = args.window_series
     params.bootstrap = args.bootstrap
+    params.bootstrap_series = args.bootstrap_series
     params.bootstrap_samples = args.bootstrap_samples
     params.input_degrees = args.degrees
     params.precision = args.precision

diff --git a/constava/calc/subsampling.py b/constava/calc/subsampling.py
@@ -18,8 +18,8 @@ class SubsamplingABC(metaclass=abc.ABCMeta):
     Methods:
     --------
         calculate(state_logpdfs)
-            Calculates the coformational state likelihoods and conformational
-            state variablility.
+            Calculates the conformational state likelihoods and conformational
+            state variability.
         calculateStatePropensities(state_likelihoods)
             Calculates the average conformational state likelihood.
         calculateStateVariability(state_likelihoods)
@@ -30,8 +30,8 @@ class SubsamplingABC(metaclass=abc.ABCMeta):
             Subsamples from the distribution of original data points.
     """
     def calculate(self, state_logpdfs):
-        """Calculates the coformational state likelihoods and conformational
-        state variablility from the sampled state logPDFs.
+        """Calculates the conformational state likelihoods and conformational
+        state variability from the sampled state logPDFs.
 
         Parameters:
         -----------
@@ -44,7 +44,7 @@ def calculate(self, state_logpdfs):
                                 Average likelihood for samples to fall in any 
                                 of the M states
             state_variability : float
-                                Variablility fo the state propensities 
+                                Variability of the state propensities
                                 throughout the sampling
             
         """
@@ -115,8 +115,8 @@ class SubsamplingWindow(SubsamplingABC):
     Methods:
     --------
         calculate(state_logpdfs)
-            Calculates the coformational state likelihoods and conformational
-            state variablility.
+            Calculates the conformational state likelihoods and conformational
+            state variability.
         calculateStatePropensities(state_likelihoods)
             Calculates the average conformational state likelihood.
         calculateStateVariability(state_likelihoods)
@@ -177,7 +177,7 @@ class SubsamplingBootstrap(SubsamplingABC):
     Attributes:
     -----------
         sample_size : int
-            Number of originial data points in each bootstrapped sample.
+            Number of original data points in each bootstrapped sample.
         n_samples : int
             Number of samples to bootstrap.
         seed: int
@@ -186,8 +186,8 @@ class SubsamplingBootstrap(SubsamplingABC):
     Methods:
     --------
         calculate(state_logpdfs)
-            Calculates the coformational state likelihoods and conformational
-            state variablility.
+            Calculates the conformational state likelihoods and conformational
+            state variability.
         calculateStatePropensities(state_likelihoods)
             Calculates the average conformational state likelihood.
         calculateStateVariability(state_likelihoods)
@@ -205,7 +205,7 @@ def __init__(self, sample_size: int, n_samples = 500, seed: Optional[int] = None
         Parameters:
         -----------
             sample_size : int
-                Number of originial data points in each bootstrapped sample.
+                Number of original data points in each bootstrapped sample.
             n_samples : int
                 Number of samples to bootstrap.
             seed: int
@@ -270,12 +270,12 @@ class SubsamplingWindowSeries(SubsamplingWindow):
     Methods:
     --------
         calculate(state_logpdfs)
-            Calculates the coformational state likelihoods and conformational
-            state variablility.
+            Calculates the conformational state likelihoods and conformational
+            state variability.
         calculateStatePropensities(state_likelihoods)
-            Calculates the average conformational state likelihood.
+            Calculates the samples' conformational state likelihood.
         calculateStateVariability(state_likelihoods)
-            Calculates the  conformational state variability.
+            Calculates the conformational state variability.
         getShortName()
             Name of the method for reference in the output.
         _subsampling(state_logpdfs)
@@ -292,13 +292,73 @@ def calculateStatePropensities(self, state_likelihoods):
 
     def calculateStateVariability(self, state_likelihoods):
         """Calculates distance of the conformational states of each sample to
-        the average conformational state. 
+        the average conformational state.
 
         Parameters:
         -----------
             state_likelihoods : Array[M,N]
                 Likelihoods for each of the M states along N samples.
-        
+
+        Returns:
+        --------
+            state_var : Array[N]
+                Conformational state distances from the average
+        """
+        mean_likelihoods = np.mean(state_likelihoods, axis=1)
+        squard_dev = np.sum((state_likelihoods.T - mean_likelihoods) ** 2, axis=1)
+        state_var = np.sqrt(squard_dev)
+        return state_var
+
+
+class SubsamplingBootstrapSeries(SubsamplingBootstrap):
+    """Class to subsample the logPDF values obtained from the probabilistic 
+    conformational state models and calculate the conformational state 
+    propensities and conformational state variability. Subsampling is done 
+    using bootstrapping. For each bootstrapped subsample the results are 
+    returned.
+
+    Attributes:
+    -----------
+        sample_size : int
+            Number of original data points in each bootstrapped sample.
+        n_samples : int
+            Number of samples to bootstrap.
+        seed: int
+            Random seed used during bootstrapping
+
+    Methods:
+    --------
+        calculate(state_logpdfs)
+            Calculates the conformational state likelihoods and conformational
+            state variability.
+        calculateStatePropensities(state_likelihoods)
+            Calculates the samples' conformational state likelihood.
+        calculateStateVariability(state_likelihoods)
+            Calculates the conformational state variability.
+        getShortName()
+            Name of the method for reference in the output.
+        _subsampling(state_logpdfs)
+            Subsamples from the distribution of original data points.
+    """
+
+    def getShortName(self) -> str:
+        """Name of the method for reference in the output."""
+        return "bootstrap_series/{0:d}/{1:d}/{2}/".format(
+            self.sample_size, self.n_samples, self.seed or "")
+
+    def calculateStatePropensities(self, state_likelihoods):
+        """Returns the per-sample conformational state likelihoods unchanged (no averaging)."""
+        return state_likelihoods
+
+    def calculateStateVariability(self, state_likelihoods):
+        """Calculates distance of the conformational states of each sample to
+        the average conformational state.
+
+        Parameters:
+        -----------
+            state_likelihoods : Array[M,N]
+                Likelihoods for each of the M states along N samples.
+
         Returns:
         --------
             state_var : Array[N]

diff --git a/constava/utils/ensembles.py b/constava/utils/ensembles.py
@@ -81,8 +81,13 @@ def __repr__(self):
 
     @property
     def n_residues(self):
-        """ Returns the number of residues form the first to last residue 
-        this might include gaps (residues without data) """
+        """Returns the number of residues in the ensemble"""
+        return len(self._residues)
+
+    @property
+    def resrange(self):
+        """Returns the range from the first to last residue.
+        This might include gaps (residues without data)"""
         return 1 + self._residues[-1].respos - self._residues[0].respos
 
     @property

diff --git a/constava/wrapper/params.py b/constava/wrapper/params.py
@@ -68,6 +68,10 @@ class ConstavaParameters:
         bootstrap : List[int] or int
             Do inference using <Int> samples obtained through bootstrapping.
             Multiple values can be given as a list.
+        bootstrap_series : List[int] or int
+            Do inference using <Int> samples obtained through bootstrapping.
+            Return the results for every subsample rather than the average. Multiple 
+            values can be given as a list.
         bootstrap_samples : int
             When bootstrapping, sample <Int> times from the input data.
 
@@ -101,6 +105,7 @@ class ConstavaParameters:
     window : typing.List[int] = field(default_factory=list)
     bootstrap : typing.List[int] = field(default_factory=list)
     window_series : typing.List[int] = field(default_factory=list)
+    bootstrap_series : typing.List[int] = field(default_factory=list)
     bootstrap_samples : int = 500
 
     # Miscellaneous Options

diff --git a/constava/wrapper/wrapper.py b/constava/wrapper/wrapper.py
@@ -6,7 +6,7 @@
 from .params import ConstavaParameters
 from ..io import ResultsWriter, EnsembleReader
 from ..calc.calculator import ConfStateCalculator
-from ..calc.subsampling import SubsamplingBootstrap, SubsamplingWindow, SubsamplingWindowSeries
+from ..calc.subsampling import SubsamplingBootstrap, SubsamplingBootstrapSeries, SubsamplingWindow, SubsamplingWindowSeries
 from ..calc.csmodels import ConfStateModelABC, ConfStateModelKDE, ConfStateModelGrid
 
 # The logger for the wrapper
@@ -128,6 +128,7 @@ def run(self) -> None:
                 window = self.get_param("window"),
                 window_series = self.get_param("window_series"),
                 bootstrap = self.get_param("bootstrap"),
+                bootstrap_series = self.get_param("bootstrap_series"),
                 bootstrap_samples = self.get_param("bootstrap_samples"),
                 bootstrap_seed  = self.get_param("seed"))
 
@@ -252,8 +253,8 @@ def load_csmodel(self, pickled_csmodel: str) -> ConfStateModelABC:
 
     def initialize_calculator(self, csmodel: ConfStateModelABC = None, 
             window: List[int] = None, window_series: List[int] = None, 
-            bootstrap: List[int] = None, bootstrap_samples: int = 500, 
-            bootstrap_seed: int = None) -> ConfStateCalculator:
+            bootstrap: List[int] = None, bootstrap_series: List[int] = None,
+            bootstrap_samples: int = 500, bootstrap_seed: int = None) -> ConfStateCalculator:
         """Initializes a ConfStateCalculator.
 
         Parameters:
@@ -272,6 +273,10 @@ def initialize_calculator(self, csmodel: ConfStateModelABC = None,
             bootstrap : List[int]
                 Subsampling using by bootstrapping <int> datapoints. Multiple
                 values can be given as a list.
+            bootstrap_series : List[int]
+                Subsampling by bootstrapping <int> datapoints. Returns the
+                results for every subsample rather than the average. Multiple
+                values can be given as a list.
             bootstrap_samples : int
                 When bootstrapping, sample <int> times from the input data.
             bootstrap_seed : int
@@ -301,4 +306,8 @@ def initialize_calculator(self, csmodel: ConfStateModelABC = None,
             new_method = SubsamplingWindowSeries(window_size)
             logger.info(f"... adding subsampling method: {new_method.getShortName()}")
             calculator.add_method(new_method)
+        for sample_size in (bootstrap_series or []):
+            new_method = SubsamplingBootstrapSeries(sample_size, bootstrap_samples, seed=bootstrap_seed)
+            logger.info(f"... adding subsampling method: {new_method.getShortName()}")
+            calculator.add_method(new_method)
         return calculator
diff --git a/requirements.txt b/requirements.txt
@@ -0,0 +1,4 @@
+MDAnalysis
+numpy
+pandas
+scikit-learn
diff --git a/setup.py b/setup.py
@@ -3,9 +3,12 @@
 with open("README.md", "r", encoding="utf-8") as fh:
     long_description = fh.read()
 
+with open("requirements.txt", "r", encoding="utf-8") as f:
+    requirements = f.read().splitlines()
+
 setup(
     name="constava",
-    version="1.0.0b4",
+    version="1.0.0",
     author="Wim Vranken",
     author_email="[email protected]",
     description="This software is used to calculate conformational states probability & conformational state "
@@ -34,12 +37,7 @@
         "Development Status :: 5 - Production/Stable"
     ],
     python_requires=">=3.8",
-    install_requires=[
-        "MDAnalysis",
-        "numpy",
-        "pandas",
-        "scikit-learn",
-    ],
+    install_requires=requirements,
     entry_points={
         "console_scripts": [
             "constava = constava.__main__:main",