From 991e26c01360228bf36a3f942dca5ffd376d9e22 Mon Sep 17 00:00:00 2001 From: Henry Date: Sat, 10 Feb 2024 15:55:16 +0100 Subject: [PATCH] Add Sphinx documentation --- .gitignore | 3 + .readthedocs.yaml | 35 +++++++++++ README.md | 1 + doc/CAMI2.md | 16 ++--- doc/README.md | 31 ++++++++++ doc/conf.py | 147 ++++++++++++++++++++++++++++++++++++++++++++++ doc/index.rst | 24 ++++++++ pyproject.toml | 20 +++++-- 8 files changed, 264 insertions(+), 13 deletions(-) create mode 100644 .readthedocs.yaml create mode 100644 doc/README.md create mode 100644 doc/conf.py create mode 100644 doc/index.rst diff --git a/.gitignore b/.gitignore index dbe311ab..9bcde12b 100644 --- a/.gitignore +++ b/.gitignore @@ -14,3 +14,6 @@ TODO.md build/ dist/ **.vscode +# doc +doc/_build +doc/reference \ No newline at end of file diff --git a/.readthedocs.yaml b/.readthedocs.yaml new file mode 100644 index 00000000..79053004 --- /dev/null +++ b/.readthedocs.yaml @@ -0,0 +1,35 @@ +# .readthedocs.yaml +# Read the Docs configuration file +# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details + +# Required +version: 2 + +# Set the OS, Python version and other tools you might need +build: + os: ubuntu-22.04 + tools: + python: "3.10" + # You can also specify other tool versions: + # nodejs: "19" + # rust: "1.64" + # golang: "1.19" + +# Build documentation in the "doc/" directory with Sphinx +sphinx: + configuration: doc/conf.py + +# Optionally build your docs in additional formats such as PDF and ePub +# formats: +# - pdf +# - epub + +# Optional but recommended, declare the Python requirements required +# to build your documentation +# See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html +python: + install: + - method: pip + path: . 
+ extra_requirements: + - docs diff --git a/README.md b/README.md index 09344df2..9d99fd4a 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,5 @@ # Vamb +[![Read the Docs](https://readthedocs.org/projects/vamb/badge/?version=latest)](https://vamb.readthedocs.io/en/latest/) Created by Jakob Nybo Nissen and Simon Rasmussen, Technical University of Denmark and Novo Nordisk Foundation Center for Protein Research, University of Copenhagen. diff --git a/doc/CAMI2.md b/doc/CAMI2.md index 20dbcbc5..0f13da3e 100644 --- a/doc/CAMI2.md +++ b/doc/CAMI2.md @@ -7,14 +7,14 @@ manner, and enables downstream users to pick the best tool for the job. In CAMI2, the Vamb (commit fa045c0) results were highly anomalous. In this text, I (the first author of Vamb), attempts to interpret the results. -### TL;DR +## TL;DR Vamb's results were poor in CAMI2. This is explained by Vamb being the only binner that included preprocessing but no postprocessing. This combination is disaterous, and not recommended for any binner, including Vamb. Based on measures not affected by postprocessing, Vamb was the best non-ensemble binner. If Vamb's best practises had been used (binsplitting), it would presumably be better still. -### Vamb's observed results in CAMI2 +## Vamb's observed results in CAMI2 In short, here's how the results appears: * Vamb is - by far - the binner with the lowest mean bin completeness @@ -30,13 +30,13 @@ than on non-unique, compared to other binners * Overall, Vamb is ranked 9/12, 5/7 and 7/10 in the three datasets. * Vamb was significantly slower to run than MetaBAT by a factor of ~600 for one dataset. -### So... what happened? +## So... what happened? On the surface, Vamb appears to weigh purity much higher than other binners, which leads to poor average performance. However, digging a little deeper, the true reason for Vamb's strange results appear - and it turns out that Vamb's results are mostly incomparable with the other binners'. 
-### Unlike the other binners, Vamb's bins were subject to no postprocessing +## Unlike the other binners, Vamb's bins were subject to no postprocessing Unlike other binners, Vamb outputs every single input contig. Contigs that could not be binned with other contigs are simply output as a single-contig bin. This means that the large majority of bins will be composed of one or a few hard-to-bin contigs. @@ -59,7 +59,7 @@ This choice is vindicated by CAMI2's result. When binning plasmids, Vamb far exceeds all other binners (F1 = 70.8 vs the next best's 12.7). Plasmids are simply filtered away by the other binners, but not by Vamb! -### Preprocessing without postprocessing gave the worst of both worlds +## Preprocessing without postprocessing gave the worst of both worlds In the CAMI2 paper, besides precision and recall, the two measures of binners are: 1. ARI: adjusted Rand index, a measure of the similarity between a binner's @@ -83,7 +83,7 @@ pre- and postprocessing had been applied, it would have done much better at ARI. The results in CAMI2, as included, represents the worst possible combination, and does not represent any recommended workflow with Vamb. -### Because the input data was co-assembled, binsplitting was not used +## Because the input data was co-assembled, binsplitting was not used Vamb's README recommends users to use _binsplitting_. This simple technique applies to binning multiple samples from similar environments. Using it means assembling each sample individually, binning all contigs together across samples, splitting the @@ -97,7 +97,7 @@ was not possible. I suspect this led to poorer performance of Vamb, because Vamb by being designed for binsplitting, may be designed to purposefully bin similar strains from different together, which would usually be separated by binsplitting. -### What would a more useful comparison look like? +## What would a more useful comparison look like? 
CAMI2's supplementary material include a table where they count the number of high-quality genomes recovered using each binner. Using a >90% recall, <5% contamination threshold, and looking at the MEGAHIT assembled input data (more realistic), Vamb @@ -173,4 +173,4 @@ a Python session and playing around with its internals. * Vamb is modular. Its contig parsing and encoding is distinct from its depth calculation, which is distinct from its variational encoding, which is distinct from its clustering. This is only possible because each part of Vamb is not opinionated -about _how_ the input was created, only _what_ the input is. \ No newline at end of file +about _how_ the input was created, only _what_ the input is. diff --git a/doc/README.md b/doc/README.md new file mode 100644 index 00000000..38a81fb5 --- /dev/null +++ b/doc/README.md @@ -0,0 +1,31 @@ +# Docs creation + +In order to build the docs you need to + + 1. install sphinx and additional support packages + 2. build the package reference files + 3. run sphinx to create a local html version + +The documentation is built automatically by Read the Docs. + +Install the docs dependencies of the package (as specified in `pyproject.toml`): + +```bash +# in main folder +pip install '.[docs]' +``` + +## Build docs using Sphinx command line tools + +Command to be run from `path/to/doc`, i.e. from within the `doc` folder: + +Options: + - `--separate` to build separate pages for each (sub-)module + +```bash +# pwd: doc +# apidoc +# sphinx-apidoc --force --implicit-namespaces --module-first -o reference ../vamb +# build docs +sphinx-build -n -W --keep-going -b html ./ ./_build/ +``` diff --git a/doc/conf.py b/doc/conf.py new file mode 100644 index 00000000..4c8d2895 --- /dev/null +++ b/doc/conf.py @@ -0,0 +1,147 @@ +# Configuration file for the Sphinx documentation builder. +# +# This file only contains a selection of the most common options. 
For a full +# list see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# +import os +from importlib import metadata + + +# -- Project information ----------------------------------------------------- + +project = "vamb" +copyright = "2024, Jakob Nybo Nissen, Simon Rasmussen" # ! please update +author = "Jakob Nybo Nissen, Simon Rasmussen" +PACKAGE_VERSION = metadata.version("vamb") +version = PACKAGE_VERSION +release = PACKAGE_VERSION + + +# -- General configuration --------------------------------------------------- + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + "sphinx.ext.autodoc", + "sphinx.ext.autodoc.typehints", + "sphinx.ext.viewcode", + "sphinx.ext.napoleon", + "sphinx.ext.intersphinx", + "sphinx_new_tab_link", + "myst_nb", +] + +# https://myst-nb.readthedocs.io/en/latest/computation/execute.html +nb_execution_mode = "auto" + +myst_enable_extensions = ["dollarmath", "amsmath"] + +# Plotly support through the RequireJS JavaScript library +# https://myst-nb.readthedocs.io/en/latest/render/interactive.html#plotly +html_js_files = [ + "https://cdnjs.cloudflare.com/ajax/libs/require.js/2.3.4/require.min.js" +] + +# https://myst-nb.readthedocs.io/en/latest/configuration.html +# Execution +nb_execution_raise_on_error = True +# Rendering +nb_merge_streams = True + +# Add any paths that contain templates here, relative to this directory. 
+templates_path = ["_templates"] + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. +exclude_patterns = [ + "_build", + "Thumbs.db", + ".DS_Store", + ".npz", +] + + +# Intersphinx options: each entry maps a name to (base URL of the docs, inventory or None) +intersphinx_mapping = { + "python": ("https://docs.python.org/3", None), + "torch": ("https://pytorch.org/docs/stable/", None), # base URL only; objects.inv is appended to it + "numpy": ("https://numpy.org/doc/stable/", None), + # "pandas": ("https://pandas.pydata.org/pandas-docs/stable/", None), + # "scikit-learn": ("https://scikit-learn.org/stable/", None), + # "matplotlib": ("https://matplotlib.org/stable/", None), +} + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# See: +# https://github.com/executablebooks/MyST-NB/blob/master/docs/conf.py +# html_title = "" +html_theme = "sphinx_book_theme" +# html_logo = "_static/logo-wide.svg" +# html_favicon = "_static/logo-square.svg" +html_theme_options = { + "github_url": "https://github.com/RasmussenLab/vamb", + "repository_url": "https://github.com/RasmussenLab/vamb", + "repository_branch": "main", + "home_page_in_toc": True, + "path_to_docs": "doc", # folder containing conf.py, relative to the repository root + "show_navbar_depth": 1, + "use_edit_page_button": True, + "use_repository_button": True, + "use_download_button": True, + "launch_buttons": { + "colab_url": "https://colab.research.google.com" + # "binderhub_url": "https://mybinder.org", + # "notebook_interface": "jupyterlab", + }, + "navigation_with_keys": False, +} + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". 
+# html_static_path = ["_static"] + + +# -- Setup for sphinx-apidoc ------------------------------------------------- + +# Read the Docs doesn't support running arbitrary commands like tox. +# sphinx-apidoc needs to be called manually if Sphinx is running there. +# https://github.com/readthedocs/readthedocs.org/issues/1139 + +if os.environ.get("READTHEDOCS") == "True": + from pathlib import Path + + PROJECT_ROOT = Path(__file__).parent.parent + PACKAGE_ROOT = PROJECT_ROOT / "vamb" + +# def run_apidoc(_): +# from sphinx.ext import apidoc +# +# apidoc.main( +# [ +# "--force", +# "--implicit-namespaces", +# "--module-first", +# "--separate", +# "-o", +# str(PROJECT_ROOT / "doc" / "reference"), +# str(PACKAGE_ROOT), +# str(PACKAGE_ROOT / "*.c"), +# str(PACKAGE_ROOT / "*.so"), +# ] +# ) +# +# def setup(app): +# app.connect("builder-inited", run_apidoc) diff --git a/doc/index.rst b/doc/index.rst new file mode 100644 index 00000000..7f1fb19e --- /dev/null +++ b/doc/index.rst @@ -0,0 +1,24 @@ +Variational Autoencoder for Metagenomic Binning (VAMB) +======================================================= + +.. include:: ../README.md + :parser: myst_parser.sphinx_ + :start-line: 1 + +.. toctree:: + :maxdepth: 2 + :caption: Tutorial + + tutorial.md + +.. 
toctree:: + :caption: Setup Documentation + + README + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` diff --git a/pyproject.toml b/pyproject.toml index 26b4739e..9919e23a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,7 @@ +# https://setuptools.pypa.io/en/latest/userguide/pyproject_config.html [project] -name = "vamb" dynamic = ["version"] +name = "vamb" dependencies = [ "numpy == 1.26.4", "torch == 2.3.1", @@ -17,16 +18,25 @@ dependencies = [ requires-python = "<3.13,>=3.9.0" scripts = {vamb = "vamb.__main__:main"} +[project.optional-dependencies] +docs = [ + "sphinx", + "sphinx-book-theme", + "myst-nb", + "ipywidgets", + "sphinx-new-tab-link!=0.2.2", +] + [metadata] authors = [ - {name="Jakob Nybo Nissen", email="jakobnybonissen@gmail.com"}, - {name="Pau Piera", email="pau.piera@cpr.ku.dk"}, - {name="Simon Rasmussen", email="simon.rasmussen@cpr.ku.dk"} + {name = "Jakob Nybo Nissen", email = "jakobnybonissen@gmail.com"}, + {name = "Pau Piera", email = "pau.piera@cpr.ku.dk"}, + {name = "Simon Rasmussen", email = "simon.rasmussen@cpr.ku.dk"}, ] -url = "https://github.com/RasmussenLab/vamb" description = "Variational and Adversarial autoencoders for Metagenomic Binning" license = "MIT" readme = {file = "README.md"} +url = "https://github.com/RasmussenLab/vamb" [build-system] requires = [