From 9b4284f256c0a202f44690d9974101516bc798a4 Mon Sep 17 00:00:00 2001 From: Acribbs Date: Wed, 1 Jan 2025 20:28:17 +0100 Subject: [PATCH 1/4] updated docs to fix formatting issues --- docs/getting_started/tutorial.md | 261 ++++++++++++++++++++++++++++--- docs/index.md | 88 +++++++---- 2 files changed, 299 insertions(+), 50 deletions(-) diff --git a/docs/getting_started/tutorial.md b/docs/getting_started/tutorial.md index 0d44309..fb8479f 100644 --- a/docs/getting_started/tutorial.md +++ b/docs/getting_started/tutorial.md @@ -182,25 +182,248 @@ def sort_bam(infile, outfile): ## Best Practices -1. **Code Organization** - - Use clear task names - - Group related tasks - - Document pipeline steps - -2. **Resource Management** - - Set appropriate memory/CPU requirements - - Use temporary directories - - Clean up intermediate files - -3. **Error Handling** - - Implement proper error checking - - Use informative error messages - - Clean up on failure - -4. **Documentation** - - Add docstrings to tasks - - Document configuration options - - Include usage examples +### Code Organization + +#### 1. Task Structure +- Use meaningful task names +- Group related tasks together +- Keep tasks focused and single-purpose +- Document task dependencies clearly + +#### 2. File Management +- Use consistent file naming patterns +- Organize output directories logically +- Clean up temporary files +- Handle file paths safely + +#### 3. Documentation +- Add docstrings to all tasks +- Document configuration parameters +- Include usage examples +- Maintain a clear README + +### Resource Management + +#### 1. 
Memory Usage +- Set appropriate memory limits +- Scale memory with input size +- Monitor memory consumption +- Handle memory errors gracefully + +```python +@transform("*.bam", suffix(".bam"), ".sorted.bam") +def sort_bam(infile, outfile): + """Sort BAM file with memory scaling.""" + # Scale memory based on input size + infile_size = os.path.getsize(infile) + job_memory = "%dG" % max(4, infile_size // (1024**3) + 2) + + statement = """ + samtools sort -m %(job_memory)s %(infile)s > %(outfile)s + """ + P.run(statement) +``` + +#### 2. CPU Allocation +- Set appropriate thread counts +- Consider cluster limitations +- Scale threads with task needs +- Monitor CPU usage + +```python +@transform("*.fa", suffix(".fa"), ".indexed") +def index_genome(infile, outfile): + """Index genome with appropriate thread count.""" + # Set threads based on system + job_threads = min(4, os.cpu_count()) + + statement = """ + bwa index -t %(job_threads)s %(infile)s + """ + P.run(statement) +``` + +#### 3. Temporary Files +- Use proper temporary directories +- Clean up after task completion +- Handle cleanup in error cases +- Monitor disk usage + +```python +@transform("*.bam", suffix(".bam"), ".sorted.bam") +def sort_with_temp(infile, outfile): + """Sort using managed temporary directory.""" + tmpdir = P.get_temp_dir() + try: + statement = """ + samtools sort -T %(tmpdir)s/sort %(infile)s > %(outfile)s + """ + P.run(statement) + finally: + P.cleanup_tmpdir() +``` + +### Error Handling + +#### 1. 
Task Failures +- Implement proper error checking +- Log informative error messages +- Clean up on failure +- Provide recovery options + +```python +@transform("*.txt", suffix(".txt"), ".processed") +def process_with_errors(infile, outfile): + """Process files with error handling.""" + try: + statement = """ + process_data %(infile)s > %(outfile)s + """ + P.run(statement) + except P.PipelineError as e: + L.error("Processing failed: %s" % e) + # Cleanup and handle error + cleanup_and_notify() + raise +``` + +#### 2. Input Validation +- Check input file existence +- Validate input formats +- Verify parameter values +- Handle missing data + +```python +@transform("*.bam", suffix(".bam"), ".stats") +def calculate_stats(infile, outfile): + """Calculate statistics with input validation.""" + # Check input file + if not os.path.exists(infile): + raise ValueError("Input file not found: %s" % infile) + + # Verify file format + if not P.is_valid_bam(infile): + raise ValueError("Invalid BAM file: %s" % infile) + + statement = """ + samtools stats %(infile)s > %(outfile)s + """ + P.run(statement) +``` + +#### 3. Logging +- Use appropriate log levels +- Include relevant context +- Log progress and milestones +- Maintain log rotation + +```python +@transform("*.data", suffix(".data"), ".processed") +def process_with_logging(infile, outfile): + """Process with comprehensive logging.""" + L.info("Starting processing of %s" % infile) + + try: + statement = """ + process_data %(infile)s > %(outfile)s + """ + P.run(statement) + L.info("Successfully processed %s" % infile) + except Exception as e: + L.error("Failed to process %s: %s" % (infile, e)) + raise +``` + +### Pipeline Configuration + +#### 1. 
Parameter Management +- Use configuration files +- Set sensible defaults +- Document parameters +- Validate parameter values + +```yaml +# pipeline.yml +pipeline: + name: example_pipeline + version: 1.0.0 + +# Resource configuration +cluster: + memory_default: 4G + threads_default: 1 + queue: main + +# Processing parameters +params: + min_quality: 20 + max_threads: 4 + chunk_size: 1000 +``` + +#### 2. Environment Setup +- Use virtual environments +- Document dependencies +- Version control configuration +- Handle platform differences + +```bash +# Create virtual environment +python -m venv pipeline-env + +# Install dependencies +pip install -r requirements.txt + +# Set environment variables +export PIPELINE_CONFIG=/path/to/pipeline.yml +``` + +#### 3. Testing +- Write unit tests +- Test with sample data +- Verify outputs +- Monitor performance + +```python +def test_pipeline(): + """Test pipeline with sample data.""" + # Run pipeline + statement = """ + python pipeline.py make all --local + """ + P.run(statement) + + # Verify outputs + assert os.path.exists("expected_output.txt") + assert check_output_validity("expected_output.txt") +``` + +### Troubleshooting + +If you encounter issues: + +1. **Check Logs** + - Review pipeline logs + - Check cluster logs + - Examine error messages + - Monitor resource usage + +2. **Common Issues** + - Memory allocation errors + - File permission problems + - Cluster queue issues + - Software version conflicts + +3. **Getting Help** + - Check documentation + - Search issue tracker + - Ask on forums + - Contact support team + +For more detailed information, see: +- [Pipeline Overview](../pipeline_modules/overview.md) +- [Cluster Configuration](../pipeline_modules/cluster.md) +- [Error Handling](../pipeline_modules/execution.md) ## Next Steps diff --git a/docs/index.md b/docs/index.md index ca539ad..b3cf1bf 100644 --- a/docs/index.md +++ b/docs/index.md @@ -8,61 +8,87 @@ Welcome to the CGAT-core documentation! 
CGAT-core is a powerful Python framework ## Key Features -- **Pipeline Management**: Build and execute complex computational workflows using Ruffus -- **Cluster Integration**: Support for multiple cluster platforms (SLURM, SGE, PBS/Torque) -- **Cloud Support**: Native integration with AWS S3, Google Cloud, and Azure +- **Pipeline Management**: Build and execute complex computational pipelines +- **Cluster Support**: Seamless integration with various cluster environments (SLURM, SGE, PBS) +- **Cloud Integration**: Native support for AWS S3 and other cloud services - **Resource Management**: Intelligent handling of compute resources and job distribution - **Container Support**: Execute pipeline tasks in containers for reproducibility ## Getting Started -1. [Installation Guide](getting_started/installation.md) - - System requirements - - Installation methods - - Verification steps +### Installation Guide +- [System Requirements](getting_started/installation.md#prerequisites) +- [Installation Methods](getting_started/installation.md#conda-installation) +- [Verification Steps](getting_started/installation.md#verification) -2. [Tutorial](getting_started/tutorial.md) - - Basic pipeline concepts - - Running your first pipeline - - Troubleshooting tips +### Tutorial +- [Basic Pipeline Concepts](getting_started/tutorial.md#core-concepts) +- [Running Your First Pipeline](getting_started/tutorial.md#tutorial-start) +- [Troubleshooting Tips](getting_started/tutorial.md#troubleshooting) -3. 
[Examples](getting_started/examples.md) - - Common use cases - - Pipeline patterns - - Best practices +### Examples +- [Common Use Cases](getting_started/examples.md#common-use-cases) +- [Pipeline Patterns](getting_started/examples.md#pipeline-patterns) +- [Best Practices](getting_started/examples.md#best-practices) ## Core Components ### Pipeline Development +#### Writing Workflows +- [Create Custom Pipeline Workflows](defining_workflow/writing_workflows.md) -- [Writing Workflows](defining_workflow/writing_workflows.md): Create custom pipeline workflows -- [Run Parameters](getting_started/run_parameters.md): Configure pipeline execution -- [Pipeline Modules](pipeline_modules/overview.md): Core pipeline components +#### Run Parameters +- [Configure Pipeline Execution](getting_started/run_parameters.md) + +#### Pipeline Modules +- [Core Pipeline Components](pipeline_modules/overview.md) ### Execution Environments -- [Cluster Configuration](pipeline_modules/cluster.md): Set up cluster execution -- [Container Support](container/whole_pipeline.md): Run pipelines in containers -- [Cloud Integration](s3_integration/configuring_s3.md): Work with cloud storage +#### Cluster Configuration +- [Set up Cluster Execution](pipeline_modules/cluster.md) + +#### Container Support +- [Run Pipelines in Containers](container/whole_pipeline.md) + +#### Cloud Integration +- [Work with Cloud Storage](s3_integration/configuring_s3.md) ### Advanced Features -- [Parameter Management](pipeline_modules/parameters.md): Handle pipeline parameters -- [Execution Control](pipeline_modules/execution.md): Manage task execution -- [Database Integration](pipeline_modules/database.md): Work with databases +#### Parameter Management +- [Handle Pipeline Parameters](pipeline_modules/parameters.md) + +#### Execution Control +- [Manage Task Execution](pipeline_modules/execution.md) + +#### Database Integration +- [Work with Databases](pipeline_modules/database.md) ## Project Information -- [How to 
Contribute](project_info/how_to_contribute.md) -- [Citations](project_info/citations.md) -- [License](project_info/license.md) -- [FAQ](project_info/faq.md) +### How to Contribute +- [Contributing Guidelines](project_info/how_to_contribute.md) + +### Citations +- [Citation Information](project_info/citations.md) + +### License +- [License Information](project_info/license.md) + +### FAQ +- [Frequently Asked Questions](project_info/faq.md) ## Additional Resources -- [API Documentation](function_doc/pipeline.md) -- [GitHub Repository](https://github.com/cgat-developers/cgat-core) -- [Issue Tracker](https://github.com/cgat-developers/cgat-core/issues) +### API Documentation +- [API Reference](function_doc/pipeline.md) + +### GitHub Repository +- [CGAT-core GitHub Repository](https://github.com/cgat-developers/cgat-core) + +### Issue Tracker +- [CGAT-core Issue Tracker](https://github.com/cgat-developers/cgat-core/issues) ## Need Help? From 75efcafc650068a819187d381f8823df81dc8cb9 Mon Sep 17 00:00:00 2001 From: Acribbs Date: Wed, 1 Jan 2025 20:50:26 +0100 Subject: [PATCH 2/4] New documentation theme --- docs/defining_workflow/run_parameter.md | 39 ---- docs/overrides/main.html | 16 ++ docs/stylesheets/extra.css | 226 ++++++++++++++++++++++++ mkdocs.yml | 74 +++++++- 4 files changed, 312 insertions(+), 43 deletions(-) delete mode 100644 docs/defining_workflow/run_parameter.md create mode 100644 docs/overrides/main.html create mode 100644 docs/stylesheets/extra.css diff --git a/docs/defining_workflow/run_parameter.md b/docs/defining_workflow/run_parameter.md deleted file mode 100644 index a1eca9d..0000000 --- a/docs/defining_workflow/run_parameter.md +++ /dev/null @@ -1,39 +0,0 @@ -# Setting run parameters - -Our workflows are executed using default settings that specify parameters for requirements such as memory, threads, environment, etc. Each of these parameters can be modified within the pipeline as needed. 
- -## Modifiable run parameters - -- **`job_memory`**: Number of slots (threads/cores/CPU) to use for the task. Default: "4G". -- **`job_total_memory`**: Total memory to use for a job. -- **`to_cluster`**: Send the job to the cluster. Default: `True`. -- **`without_cluster`**: Run the job locally when set to `True`. Default: `False`. -- **`cluster_memory_ulimit`**: Restrict virtual memory. Default: `False`. -- **`job_condaenv`**: Name of the conda environment to use for each job. Default: will use the one specified in `bashrc`. -- **`job_array`**: If set to `True`, run the statement as an array job. `job_array` should be a tuple with start, end, and increment values. Default: `False`. - -## Specifying parameters to a job - -Parameters can be set within a pipeline task as follows: - -```python -@transform('*.unsorted', suffix('.unsorted'), '.sorted') -def sortFile(infile, outfile): - statement = '''sort -t %(tmpdir)s %(infile)s > %(outfile)s''' - P.run(statement, - job_condaenv="sort_environment", - job_memory="30G", - job_threads=2, - without_cluster=False, - job_total_memory="50G") -``` - -In this example, the `sortFile` function sorts an unsorted file and saves it as a new sorted file. The `P.run()` statement is used to specify various parameters: - -- `job_condaenv="sort_environment"`: This specifies that the task should use the `sort_environment` conda environment. -- `job_memory="30G"`: This sets the memory requirement for the task to 30GB. -- `job_threads=2`: The task will use 2 threads. -- `without_cluster=False`: This ensures the job is sent to the cluster. -- `job_total_memory="50G"`: The total memory allocated for the job is 50GB. - -These parameters allow fine-tuning of job execution to fit specific computational requirements, such as allocating more memory or running on a local machine rather than a cluster. 
\ No newline at end of file diff --git a/docs/overrides/main.html b/docs/overrides/main.html new file mode 100644 index 0000000..480ea48 --- /dev/null +++ b/docs/overrides/main.html @@ -0,0 +1,16 @@ +{% extends "base.html" %} + +{% block site_meta %} +{{ super() }} + +{% endblock %} + +{% block announce %} + + For updates follow @cgat-developers on + + {% include ".icons/fontawesome/brands/github.svg" %} + + GitHub + +{% endblock %} diff --git a/docs/stylesheets/extra.css b/docs/stylesheets/extra.css new file mode 100644 index 0000000..372de4e --- /dev/null +++ b/docs/stylesheets/extra.css @@ -0,0 +1,226 @@ +/* CGAT-Core Custom Theme Colors */ +:root { + --md-primary-fg-color: #A4216A; + --md-primary-fg-color--light: #D1E751; + --md-primary-fg-color--dark: #6DCFF6; + --md-accent-fg-color: #F7941E; + --md-accent-fg-color--transparent: #F7941E1A; +} + +/* Header customization */ +.md-header { + background-color: white; + color: #2C2C2C; + box-shadow: 0 0 0.2rem rgba(0,0,0,.1), 0 0.2rem 0.4rem rgba(0,0,0,.2); +} + +.md-header__title { + color: #2C2C2C; +} + +.md-header__button { + color: #2C2C2C; +} + +.md-search__input { + background-color: rgba(0,0,0,.05); + color: #2C2C2C; +} + +.md-search__input::placeholder { + color: #2C2C2C; + opacity: 0.7; +} + +/* Navigation tabs customization */ +.md-tabs { + background-color: var(--md-primary-fg-color); + color: white; +} + +.md-tabs__link { + color: white; + opacity: 0.7; +} + +.md-tabs__link--active, +.md-tabs__link:hover { + color: white; + opacity: 1; +} + +/* Light mode customizations */ +[data-md-color-scheme="default"] { + --md-default-fg-color: #1A1A1A; + --md-default-fg-color--light: #2C2C2C; + --md-default-fg-color--lighter: #404040; + --md-default-fg-color--lightest: #686868; + --md-default-bg-color: #FFFFFF; + --md-code-bg-color: #F5F5F5; + --md-code-fg-color: #24292E; + --md-typeset-a-color: #A4216A; + --md-typeset-mark-color: #D1E75133; +} + +/* Dark mode customizations */ +[data-md-color-scheme="slate"] { + 
--md-default-fg-color: #FFFFFF; + --md-default-fg-color--light: #E5E5E5; + --md-default-bg-color: #1A1A1A; + --md-code-bg-color: #2D2D2D; + --md-code-fg-color: #E5E5E5; + --md-typeset-a-color: #6DCFF6; + --md-typeset-mark-color: #D1E75133; +} + +/* Main content text customization */ +.md-typeset { + color: var(--md-default-fg-color); + font-size: 0.8rem; + line-height: 1.6; +} + +.md-typeset h1, +.md-typeset h2, +.md-typeset h3, +.md-typeset h4, +.md-typeset h5, +.md-typeset h6 { + color: var(--md-default-fg-color); + font-weight: 600; +} + +/* Navigation customization */ +.md-nav__item { + color: var(--md-default-fg-color); +} + +.md-nav__link { + color: var(--md-default-fg-color) !important; +} + +.md-nav__link:hover { + color: var(--md-accent-fg-color) !important; +} + +.md-nav__link--active { + color: var(--md-primary-fg-color) !important; +} + +/* Button customization */ +.md-button { + background-color: var(--md-primary-fg-color); + border-color: var(--md-primary-fg-color); + color: white !important; +} + +.md-button:hover { + background-color: var(--md-accent-fg-color); + border-color: var(--md-accent-fg-color); +} + +/* Code block customization */ +.highlight .nx { + color: var(--md-code-fg-color); +} + +/* Admonition customization */ +.md-typeset .admonition, +.md-typeset details { + border-left: 0.2rem solid var(--md-primary-fg-color); +} + +.md-typeset .admonition.note, +.md-typeset details.note { + border-left-color: var(--md-accent-fg-color); +} + +/* Search highlighting */ +.md-search-result__link:hover { + background-color: var(--md-accent-fg-color--transparent); +} + +/* Table customization */ +.md-typeset table:not([class]) th { + background-color: var(--md-primary-fg-color); + color: var(--md-default-bg-color); +} + +/* Progress bars */ +.md-progress__bar { + background-color: var(--md-primary-fg-color); +} + +/* Footer customization */ +.md-footer { + background-color: var(--md-primary-fg-color); +} + +.md-footer-meta { + background-color: 
var(--md-primary-fg-color--dark); +} + +/* Logo customization */ +.md-header__button.md-logo { + padding: 0.2rem; + margin: 0.2rem; +} + +.md-header__button.md-logo img { + height: 2.2rem; + width: auto; +} + +/* Reduce logo size on mobile */ +@media screen and (max-width: 76.1875em) { + .md-header__button.md-logo img { + height: 1.8rem; + } +} + +/* Center logo vertically */ +.md-header__button.md-logo { + display: flex; + align-items: center; +} + +/* Custom logo size */ +.md-header__button.md-logo img, +.md-header__button.md-logo svg { + height: 2rem; + width: auto; +} + +/* Repository link customization */ +.md-header__source { + background-color: rgba(0,0,0,.05); + color: #2C2C2C; +} + +/* Search overlay customization */ +.md-search__overlay { + background-color: white; +} + +/* Announcement bar customization */ +.md-announce { + background-color: var(--md-primary-fg-color); + color: white; +} + +.md-announce__inner { + color: white; +} + +.md-announce a { + color: white; + text-decoration: underline; +} + +.md-announce a:hover { + color: var(--md-primary-fg-color--light); +} + +.md-announce .twemoji { + color: white; +} diff --git a/mkdocs.yml b/mkdocs.yml index 8abf2a6..21d6c76 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -1,15 +1,74 @@ site_name: CGAT-core Documentation -theme: material +theme: + name: material + logo: img/CGAT_logo.png + favicon: img/CGAT_logo.png + palette: + # Light mode + - media: "(prefers-color-scheme: light)" + scheme: default + primary: custom + accent: custom + toggle: + icon: material/toggle-switch + name: Switch to dark mode + # Dark mode + - media: "(prefers-color-scheme: dark)" + scheme: slate + primary: custom + accent: custom + toggle: + icon: material/toggle-switch-off-outline + name: Switch to light mode + + features: + - navigation.instant + - navigation.tracking + - navigation.tabs + - navigation.sections + - navigation.expand + - navigation.top + - search.suggest + - search.highlight + - content.tabs.link + - 
content.code.annotate + - content.code.copy + + font: + text: Roboto + code: Roboto Mono + + icon: + repo: fontawesome/brands/github + + custom_dir: docs/overrides + +extra_css: + - stylesheets/extra.css + plugins: - search - mkdocstrings: default_handler: python + handlers: + python: + options: + show_source: true + show_if_no_docstring: true markdown_extensions: - admonition - codehilite + - pymdownx.highlight: + anchor_linenums: true + - pymdownx.inlinehilite + - pymdownx.snippets + - pymdownx.superfences + - attr_list + - md_in_html - toc: permalink: true + nav: - Home: index.md - Getting Started: @@ -17,7 +76,7 @@ nav: - Cluster Configuration: getting_started/run_parameters.md - Running a Pipeline: getting_started/examples.md - Running a Pipeline Tutorial: getting_started/tutorial.md - - Run Parameters: defining_workflow/run_parameters.md + - Run Parameters: getting_started/run_parameters.md - Building a Workflow: - Workflow Overview: defining_workflow/writing_workflows.md - Writing a Workflow Tutorial: defining_workflow/tutorial.md @@ -30,9 +89,12 @@ nav: - Execution: pipeline_modules/execution.md - Utils: pipeline_modules/utils.md - Parameters: pipeline_modules/parameters.md + - Executors: pipeline_modules/executors.md + - Farm: pipeline_modules/farm.md + - Run Functions: pipeline_modules/run_function.md - Container support: - Individual tasks: container/tasks.md - - Whole pipeline: docs/container/whole_pipeline.md + - Whole pipeline: container/whole_pipeline.md - S3 Cloud: - S3 Pipeline: s3_integration/s3_pipeline.md - S3 Decorators: s3_integration/s3_decorators.md @@ -54,4 +116,8 @@ nav: - How to Contribute: project_info/how_to_contribute.md - Citations: project_info/citations.md - FAQ: project_info/faq.md - - License: project_info/license.md \ No newline at end of file + - License: project_info/license.md + +repo_url: https://github.com/cgat-developers/cgat-core +repo_name: cgat-developers/cgat-core +edit_uri: edit/main/docs/ \ No newline at end of 
file From c891bffc3a987288605eff883db1f72d2211e38b Mon Sep 17 00:00:00 2001 From: Acribbs Date: Wed, 1 Jan 2025 20:51:55 +0100 Subject: [PATCH 3/4] mkdoc github actions updated for new theme --- .github/workflows/cgatcore_python.yml | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/.github/workflows/cgatcore_python.yml b/.github/workflows/cgatcore_python.yml index 00cba1f..50ecbb6 100644 --- a/.github/workflows/cgatcore_python.yml +++ b/.github/workflows/cgatcore_python.yml @@ -81,9 +81,15 @@ jobs: - name: Install MkDocs and Dependencies run: | - pip install mkdocs mkdocs-material mkdocstrings[python] + pip install mkdocs \ + mkdocs-material \ + mkdocstrings[python] \ + pymdown-extensions \ + mkdocs-material[imaging] - name: Build and Deploy MkDocs Site - run: mkdocs gh-deploy --force --clean + run: | + mkdocs build --strict + mkdocs gh-deploy --force --clean env: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} From 9869f45a768847bc02e54a69f46db1c011164fbe Mon Sep 17 00:00:00 2001 From: Acribbs Date: Wed, 1 Jan 2025 21:01:36 +0100 Subject: [PATCH 4/4] Updated broken links and fixed mkdocs build --- .github/workflows/cgatcore_python.yml | 2 +- docs/getting_started/examples.md | 26 +++++++-------- docs/getting_started/installation.md | 18 +++++----- docs/getting_started/tutorial.md | 48 +++++++++++++-------------- docs/index.md | 2 +- docs/s3_integration/configuring_s3.md | 5 ++- 6 files changed, 50 insertions(+), 51 deletions(-) diff --git a/.github/workflows/cgatcore_python.yml b/.github/workflows/cgatcore_python.yml index 50ecbb6..4036db5 100644 --- a/.github/workflows/cgatcore_python.yml +++ b/.github/workflows/cgatcore_python.yml @@ -89,7 +89,7 @@ jobs: - name: Build and Deploy MkDocs Site run: | - mkdocs build --strict + mkdocs build mkdocs gh-deploy --force --clean env: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/docs/getting_started/examples.md b/docs/getting_started/examples.md index 65bd620..8c66d71 100644 --- 
a/docs/getting_started/examples.md +++ b/docs/getting_started/examples.md @@ -228,13 +228,13 @@ When running the pipeline, make sure to specify `--no-cluster` as a command line - **Logs**: Check the log files generated during the pipeline run for detailed error messages. - **Support**: For further assistance, refer to the [CGAT-core documentation](https://cgat-developers.github.io/cgat-core/) or raise an issue on our [GitHub repository](https://github.com/cgat-developers/cgat-core/issues). -## CGAT-core Examples +## CGAT-core Examples {#cgat-core-examples} This guide provides practical examples of CGAT-core pipelines for various use cases, from basic file processing to complex genomics workflows. -## Quick Start Examples +## Quick Start Examples {#quick-start-examples} -### Hello World Pipeline +### Hello World Pipeline {#hello-world-pipeline} ```python """hello_world.py - Simple CGAT pipeline example @@ -271,7 +271,7 @@ if __name__ == "__main__": sys.exit(P.main(sys.argv)) ``` -### Configuration Example +### Configuration Example {#configuration-example} ```yaml # pipeline.yml @@ -287,9 +287,9 @@ cluster: memory_default: 1G ``` -## Real-World Examples +## Real-World Examples {#real-world-examples} -### 1. Genomics Pipeline +### 1. Genomics Pipeline {#genomics-pipeline} This example demonstrates a typical RNA-seq analysis pipeline: @@ -380,7 +380,7 @@ if __name__ == "__main__": sys.exit(P.main(sys.argv)) ``` -### 2. Data Processing Pipeline +### 2. Data Processing Pipeline {#data-processing-pipeline} Example of a data processing pipeline with S3 integration: @@ -455,7 +455,7 @@ if __name__ == "__main__": sys.exit(P.main(sys.argv)) ``` -### 3. Image Processing Pipeline +### 3. Image Processing Pipeline {#image-processing-pipeline} Example of an image processing pipeline: @@ -522,9 +522,9 @@ if __name__ == "__main__": sys.exit(P.main(sys.argv)) ``` -## Best Practices +## Best Practices {#best-practices} -### 1. Resource Management +### 1. 
Resource Management {#resource-management} ```python @transform("*.bam", suffix(".bam"), ".sorted.bam") @@ -550,7 +550,7 @@ def sort_bam(infile, outfile): P.run(statement) ``` -### 2. Error Handling +### 2. Error Handling {#error-handling} ```python @transform("*.txt", suffix(".txt"), ".processed") @@ -571,7 +571,7 @@ def robust_processing(infile, outfile): P.cleanup_tmpdir() ``` -### 3. Configuration Management +### 3. Configuration Management {#configuration-management} ```yaml # pipeline.yml - Example configuration @@ -611,7 +611,7 @@ s3: max_concurrency: 10 ``` -## Running the Examples +## Running the Examples {#running-the-examples} 1. **Setup Configuration** ```bash diff --git a/docs/getting_started/installation.md b/docs/getting_started/installation.md index 3205874..b97c81f 100644 --- a/docs/getting_started/installation.md +++ b/docs/getting_started/installation.md @@ -2,7 +2,7 @@ The following sections describe how to install the `cgatcore` framework. -## Conda installation +## Conda installation {#conda-installation} The preferred method of installation is using Conda. If you do not have Conda installed, you can install it using [Miniconda](https://conda.io/miniconda.html) or [Anaconda](https://www.anaconda.com/download/#macos). @@ -12,7 +12,7 @@ The preferred method of installation is using Conda. If you do not have Conda in conda install -c conda-forge -c bioconda cgatcore ``` -### Prerequisites +### Prerequisites {#prerequisites} Before installing `cgatcore`, ensure that you have the following prerequisites: @@ -20,13 +20,13 @@ Before installing `cgatcore`, ensure that you have the following prerequisites: - **Python**: Version 3.6 or higher - **Conda**: Recommended for dependency management -### Troubleshooting +### Troubleshooting {#troubleshooting} - **Conda Issues**: If you encounter issues with Conda, ensure that the Bioconda and Conda-Forge channels are added and prioritized correctly. 
- **Pip Dependencies**: When using pip, manually install any missing dependencies listed in the error messages. - **Script Errors**: If the installation script fails, check the script's output for error messages and ensure all prerequisites are met. -### Verification +### Verification {#verification} After installation, verify the installation by running: @@ -41,7 +41,7 @@ print(cgatcore.__version__) This should display the installed version of `cgatcore`. -## Pip installation +## Pip installation {#pip-installation} We recommend installation through Conda because it manages dependencies automatically. However, `cgatcore` is generally lightweight and can also be installed using the `pip` package manager. Note that you may need to manually install other dependencies as needed: @@ -49,7 +49,7 @@ We recommend installation through Conda because it manages dependencies automati pip install cgatcore ``` -## Automated installation +## Automated installation {#automated-installation} The preferred method to install `cgatcore` is using Conda. However, we have also created a Bash installation script, which uses [Conda](https://conda.io/docs/) under the hood. @@ -78,7 +78,7 @@ conda activate cgat-c The installation script will place everything under the specified location. The aim of the script is to provide a portable installation that does not interfere with existing software environments. As a result, you will have a dedicated Conda environment that can be activated as needed to work with `cgatcore`. -## Manual installation +## Manual installation {#manual-installation} To obtain the latest code, check it out from the public Git repository and activate it: @@ -94,7 +94,7 @@ To update to the latest version, simply pull the latest changes: git pull ``` -## Installing additional software +## Installing additional software {#installing-additional-software} When building your own workflows, we recommend using Conda to install software into your environment where possible. 
This ensures compatibility and ease of installation. @@ -105,7 +105,7 @@ conda search conda install ``` -## Accessing the libdrmaa shared library +## Accessing the libdrmaa shared library {#accessing-libdrmaa} You may also need access to the `libdrmaa.so.1.0` C library, which can often be installed as part of the `libdrmaa-dev` package on most Unix systems. Once installed, you may need to specify the location of the DRMAA library if it is not in a default library path. Set the `DRMAA_LIBRARY_PATH` environment variable to point to the library location. diff --git a/docs/getting_started/tutorial.md b/docs/getting_started/tutorial.md index fb8479f..3426385 100644 --- a/docs/getting_started/tutorial.md +++ b/docs/getting_started/tutorial.md @@ -8,9 +8,9 @@ The aim of this pipeline is to perform pseudoalignment using `kallisto`. The pip The `cgat-showcase` pipeline highlights some of the functionality of `cgat-core`. Additionally, more advanced workflows for next-generation sequencing analysis are available in the [cgat-flow repository](https://github.com/cgat-developers/cgat-flow). -## Tutorial start +## Tutorial start {#tutorial-start} -### Step 1: Download the tutorial data +### Step 1: Download the tutorial data {#download-data} Create a new directory, navigate to it, and download the test data: @@ -21,7 +21,7 @@ wget https://www.cgat.org/downloads/public/showcase/showcase_test_data.tar.gz tar -zxvf showcase_test_data.tar.gz ``` -### Step 2: Generate a configuration YAML file +### Step 2: Generate a configuration YAML file {#generate-config} Navigate to the test data directory and generate a configuration file for the pipeline: @@ -38,7 +38,7 @@ python /path/to/file/pipeline_transdiffexpres.py config This will generate a `pipeline.yml` file containing configuration parameters that can be used to modify the pipeline output. For this tutorial, you do not need to modify the parameters to run the pipeline. 
In the [Modify Config](#modify-config) section below, you will find details on how to adjust the config file to change the pipeline's output. -### Step 3: Run the pipeline +### Step 3: Run the pipeline {#run-pipeline} To run the pipeline, execute the following command in the directory containing the `pipeline.yml` file: @@ -56,17 +56,17 @@ cgatshowcase --help This will start the pipeline execution. Monitor the output for any errors or warnings. -### Step 4: Review Results +### Step 4: Review Results {#review-results} Once the pipeline completes, review the output files generated in the `showcase_test_data` directory. These files contain the results of the pseudoalignment. -### Troubleshooting +### Troubleshooting {#troubleshooting} - **Common Issues**: If you encounter errors during execution, ensure that all dependencies are installed and paths are correctly set. - **Logs**: Check the log files generated during the pipeline run for detailed error messages. - **Support**: For further assistance, refer to the [CGAT-core documentation](https://cgat-core.readthedocs.io/en/latest/) or raise an issue on our [GitHub repository](https://github.com/cgat-developers/cgat-core/issues). -### Step 5: Generate a report +### Step 5: Generate a report {#generate-report} The final step is to generate a report to display the output of the pipeline. We recommend using `MultiQC` for generating reports from commonly used bioinformatics tools (such as mappers and pseudoaligners) and `Rmarkdown` for generating custom reports. @@ -78,9 +78,9 @@ cgatshowcase transdiffexprs make build_report -v 5 --no-cluster This will generate a `MultiQC` report in the folder `MultiQC_report.dir/` and an `Rmarkdown` report in `R_report.dir/`. -## Core Concepts +## Core Concepts {#core-concepts} -### Pipeline Structure +### Pipeline Structure {#pipeline-structure} A CGAT pipeline typically consists of: 1. **Tasks**: Individual processing steps @@ -88,7 +88,7 @@ A CGAT pipeline typically consists of: 3. 
**Configuration**: Pipeline settings 4. **Execution**: Running the pipeline -### Task Types +### Task Types {#task-types} 1. **@transform**: One-to-one file transformation ```python @@ -111,7 +111,7 @@ def split_file(infile, outfiles): pass ``` -### Resource Management +### Resource Management {#resource-management} Control resource allocation: ```python @@ -126,7 +126,7 @@ def sort_bam(infile, outfile): P.run(statement) ``` -### Error Handling +### Error Handling {#error-handling} Implement robust error handling: ```python @@ -137,9 +137,9 @@ except P.PipelineError as e: raise ``` -## Advanced Topics +## Advanced Topics {#advanced-topics} -### 1. Pipeline Parameters +### 1. Pipeline Parameters {#pipeline-parameters} Access configuration parameters: ```python @@ -150,7 +150,7 @@ threads = PARAMS.get("threads", 1) input_dir = PARAMS["input_dir"] ``` -### 2. Logging +### 2. Logging {#logging} Use the logging system: ```python @@ -164,7 +164,7 @@ L.warning("Low memory condition") L.error("Task failed: %s" % e) ``` -### 3. Temporary Files +### 3. Temporary Files {#temporary-files} Manage temporary files: ```python @@ -180,9 +180,9 @@ def sort_bam(infile, outfile): P.run(statement) ``` -## Best Practices +## Best Practices {#best-practices} -### Code Organization +### Code Organization {#code-organization} #### 1. Task Structure - Use meaningful task names @@ -202,7 +202,7 @@ def sort_bam(infile, outfile): - Include usage examples - Maintain a clear README -### Resource Management +### Resource Management {#resource-management-best-practices} #### 1. Memory Usage - Set appropriate memory limits @@ -263,7 +263,7 @@ def sort_with_temp(infile, outfile): P.cleanup_tmpdir() ``` -### Error Handling +### Error Handling {#error-handling-best-practices} #### 1. Task Failures - Implement proper error checking @@ -334,7 +334,7 @@ def process_with_logging(infile, outfile): raise ``` -### Pipeline Configuration +### Pipeline Configuration {#pipeline-configuration} #### 1. 
Parameter Management - Use configuration files @@ -398,7 +398,7 @@ def test_pipeline(): assert check_output_validity("expected_output.txt") ``` -### Troubleshooting +### Troubleshooting {#troubleshooting-best-practices} If you encounter issues: @@ -425,7 +425,7 @@ For more detailed information, see: - [Cluster Configuration](../pipeline_modules/cluster.md) - [Error Handling](../pipeline_modules/execution.md) -## Next Steps +## Next Steps {#next-steps} - Review the [Examples](examples.md) section - Learn about [Cluster Configuration](../pipeline_modules/cluster.md) @@ -433,6 +433,6 @@ For more detailed information, see: For more advanced topics, see the [Pipeline Modules](../pipeline_modules/overview.md) documentation. -## Conclusion +## Conclusion {#conclusion} This completes the tutorial for running the `transdiffexprs` pipeline for `cgat-showcase`. We hope you find it as useful as we do for writing workflows in Python. \ No newline at end of file diff --git a/docs/index.md b/docs/index.md index b3cf1bf..91480fe 100644 --- a/docs/index.md +++ b/docs/index.md @@ -176,7 +176,7 @@ By leveraging these modules and decorators, you can build powerful, scalable, an ## Quick Links - [Getting Started](getting_started/installation.md) -- [Building a Workflow](defining_workflow/writing_workflow.md) +- [Building a Workflow](defining_workflow/writing_workflows.md) - [Pipeline Modules Overview](pipeline_modules/overview.md) - [S3 Integration](s3_integration/s3_pipeline.md) - [Working with Remote Files](remote/s3.md) diff --git a/docs/s3_integration/configuring_s3.md b/docs/s3_integration/configuring_s3.md index 5bd05c8..c39193f 100644 --- a/docs/s3_integration/configuring_s3.md +++ b/docs/s3_integration/configuring_s3.md @@ -213,7 +213,6 @@ logging.getLogger('botocore').setLevel(logging.DEBUG) - Use bucket policies - Enable access logging -For more information, see: +For more examples of using S3 in your pipelines, see the [S3 Pipeline Examples](s3_pipeline.md#examples) section. 
- [AWS S3 Documentation](https://docs.aws.amazon.com/s3/) -- [Boto3 Documentation](https://boto3.amazonaws.com/v1/documentation/api/latest/index.html) -- [CGAT Pipeline Examples](examples.md) \ No newline at end of file +- [Boto3 Documentation](https://boto3.amazonaws.com/v1/documentation/api/latest/index.html) \ No newline at end of file