First pass at generating mermaid diagrams for all workflows

galaxyproject · Nov 30, 2024 · cb06c58 · cb06c58
1 parent 7fa5211
commit cb06c58
Show file tree

Hide file tree

Showing 83 changed files with 3,807 additions and 0 deletions.
diff --git a/scripts/create_mermaid.py b/scripts/create_mermaid.py
@@ -0,0 +1,94 @@
+import argparse
+import os
+import json
+import re
+from typing import Literal
+
+# Mermaid shape annotation appended after a node's quoted label, keyed by the
+# workflow step's "type" value. Step types not listed here get no annotation.
+STEP_TYPE_TO_SHAPE = dict(
+    data_input="@{ shape: doc }",
+    data_collection_input="@{ shape: docs }",
+    parameter_input="@{ shape: lean-l }",
+    tool="@{ shape: process }",
+    subworkflow="@{ shape: subprocess }",
+)
+
+
+def escape_mermaid_string(input_string: str) -> str:
+    """
+    Prefix every Mermaid-special character in a string with a backslash.
+
+    Args:
+        input_string: The raw text (e.g. a step label) to escape.
+
+    Returns:
+        The text with a backslash inserted before each of the characters
+        [ ] { } | < > \\ " ` * _ # + ( ); all other characters are unchanged.
+    """
+    # NOTE(review): Mermaid's documented escaping mechanism is entity codes
+    # (e.g. #quot;), not backslashes -- confirm these escapes render as intended.
+    pattern = r'[\[\]\{\}\|\<\>\\"`\*_#\+\(\)\\]'
+    # Replacement template: a literal backslash followed by the matched character.
+    return re.sub(pattern, r"\\\g<0>", input_string)
+
+
+def step_to_mermaid_item(
+    step_type: Literal[
+        "parameter_input", "data_input", "data_collection_input", "tool", "subworkflow"
+    ],
+    step_label: str,
+):
+    """
+    Build the Mermaid node text for a single workflow step.
+
+    Args:
+        step_type: The step's type; selects the shape from STEP_TYPE_TO_SHAPE.
+        step_label: The label shown in the node. It is inserted as-is, so it
+            must already be escaped by the caller.
+
+    Returns:
+        The quoted, bracketed label followed by the shape annotation, e.g.
+        '["Cutadapt"]@{ shape: process }'. The node id is NOT included.
+    """
+    # Step types missing from the table fall back to no shape annotation.
+    shape_suffix = STEP_TYPE_TO_SHAPE.get(step_type, "")
+    return f'["{step_label}"]{shape_suffix}'
+
+
+def workflow_to_mermaid(workflow_json):
+    """
+    Converts a Galaxy workflow JSON to a Mermaid flowchart diagram.
+
+    Each step becomes one node line ("<step key><label and shape>"), followed by
+    one edge line ("<source id> --> <step key>") per input connection.
+
+    Args:
+        workflow_json: The JSON representation of the Galaxy workflow. Must
+            contain a "steps" mapping of step key -> step dict.
+
+    Returns:
+        A string representing the Mermaid flowchart diagram. It starts with
+        "graph LR" and every line, including the last, ends with a newline.
+
+    Raises:
+        KeyError: If "steps" is missing, or a connection has no "id".
+    """
+    lines = ["graph LR"]
+
+    for step_id, step in workflow_json["steps"].items():
+        # Label preference: explicit label, then tool/step name, then content
+        # id, then the numeric id. The last fallback is not a string, so the
+        # result is coerced with str() before escaping.
+        raw_label = (
+            step.get("label")
+            or step.get("name")
+            or step.get("content_id")
+            or step.get("id", step_id)
+        )
+        step_label = escape_mermaid_string(str(raw_label))
+        lines.append(f'{step_id}{step_to_mermaid_item(step.get("type"), step_label)}')
+
+        for input_connection in step.get("input_connections", {}).values():
+            # An input is either a single connection dict or a list of them;
+            # normalise to a list so both forms produce edges.
+            if isinstance(input_connection, list):
+                connections = input_connection
+            else:
+                connections = [input_connection]
+            for ic in connections:
+                lines.append(f"{ic['id']} --> {step_id}")
+
+    return "\n".join(lines) + "\n"
+
+
+def walk_directory(directory):
+    """
+    Walk directory and call workflow_to_mermaid on each discovered .ga file.
+
+    For every file whose name ends in ".ga", the resulting diagram is written
+    next to it as "<name without extension>_diagram.md", wrapped in a mermaid
+    code fence. An existing diagram file is overwritten.
+
+    Args:
+        directory: Root directory to search recursively.
+    """
+    for root, _, paths in os.walk(directory):
+        for path in paths:
+            if not path.endswith(".ga"):
+                continue
+            file_path = os.path.join(root, path)
+            # Read and write as UTF-8 explicitly so the result does not depend
+            # on the platform's default encoding.
+            with open(file_path, "r", encoding="utf-8") as f:
+                workflow_data = json.load(f)
+            mermaid_diagram = workflow_to_mermaid(workflow_data)
+
+            mmd_path = f"{os.path.splitext(file_path)[0]}_diagram.md"
+            with open(mmd_path, "w", encoding="utf-8") as f:
+                f.write(f"```mermaid\n{mermaid_diagram}\n```")
+
+
+def parse_args():
+    """
+    Parse the command-line arguments of the script.
+
+    Returns:
+        The parsed argparse namespace; its "directory" attribute holds the
+        single required positional argument.
+    """
+    cli = argparse.ArgumentParser(description="Process files in a directory")
+    cli.add_argument("directory", type=str, help="Path to the input directory")
+    namespace = cli.parse_args()
+    return namespace
+
+
+if __name__ == "__main__":
+    # Script entry point: generate a diagram for every workflow found under
+    # the directory given on the command line.
+    walk_directory(parse_args().directory)
diff --git a/...bly-v2/Assembly-Hifi-HiC-phasing-VGP4/Assembly-Hifi-HiC-phasing-VGP4_diagram.md b/...bly-v2/Assembly-Hifi-HiC-phasing-VGP4/Assembly-Hifi-HiC-phasing-VGP4_diagram.md
@@ -0,0 +1,116 @@
+```mermaid
+graph LR
+0["Pacbio Reads Collection"]@{ shape: docs }
+1["HiC forward reads"]@{ shape: doc }
+2["HiC reverse reads"]@{ shape: doc }
+3["Genomescope Summary"]@{ shape: doc }
+4["Meryl Database"]@{ shape: doc }
+5["Database for Busco Lineage"]@{ shape: lean-l }
+6["Lineage"]@{ shape: lean-l }
+7["Name for Haplotype 1"]@{ shape: lean-l }
+8["Name for Haplotype 2"]@{ shape: lean-l }
+9["Bits for bloom filter"]@{ shape: lean-l }
+10["SAK input file"]@{ shape: doc }
+11["Homozygous Read Coverage"]@{ shape: lean-l }
+12["Genomescope Model Parameters"]@{ shape: doc }
+13["Cutadapt"]@{ shape: process }
+0 --> 13
+14["Search in textfiles"]@{ shape: process }
+3 --> 14
+15["Compute"]@{ shape: process }
+12 --> 15
+16["MultiQC"]@{ shape: process }
+13 --> 16
+17["Replace Text"]@{ shape: process }
+14 --> 17
+18["Cut"]@{ shape: process }
+15 --> 18
+19["Convert"]@{ shape: process }
+17 --> 19
+20["Estimated homozygous read coverage"]@{ shape: process }
+18 --> 20
+21["Cut"]@{ shape: process }
+19 --> 21
+22["Homozygous read coverage for Hifiasm"]@{ shape: process }
+11 --> 22
+20 --> 22
+23["Estimated genome size"]@{ shape: process }
+21 --> 23
+24["Hifiasm"]@{ shape: process }
+22 --> 24
+9 --> 24
+1 --> 24
+2 --> 24
+13 --> 24
+25["Raw Unitig Image"]@{ shape: process }
+24 --> 25
+26["gfastats"]@{ shape: process }
+24 --> 26
+23 --> 26
+27["gfastats"]@{ shape: process }
+24 --> 27
+23 --> 27
+28["gfastats"]@{ shape: process }
+24 --> 28
+29["gfastats"]@{ shape: process }
+24 --> 29
+30["gfastats"]@{ shape: process }
+24 --> 30
+31["gfastats"]@{ shape: process }
+24 --> 31
+32["gfastats"]@{ shape: process }
+24 --> 32
+10 --> 32
+33["gfastats"]@{ shape: process }
+24 --> 33
+10 --> 33
+34["Text reformatting"]@{ shape: process }
+26 --> 34
+35["Text reformatting"]@{ shape: process }
+27 --> 35
+36["Data Prep Hap2"]@{ shape: subprocess }
+28 --> 36
+37["Data Prep Hap1"]@{ shape: subprocess }
+30 --> 37
+38["Text transformation"]@{ shape: process }
+32 --> 38
+39["Text transformation"]@{ shape: process }
+33 --> 39
+40["Join two Datasets"]@{ shape: process }
+35 --> 40
+34 --> 40
+41["Plot Data"]@{ shape: subprocess }
+36 --> 41
+8 --> 41
+7 --> 41
+37 --> 41
+42["Busco"]@{ shape: process }
+38 --> 42
+5 --> 42
+6 --> 42
+43["Merqury"]@{ shape: process }
+39 --> 43
+38 --> 43
+4 --> 43
+44["Busco"]@{ shape: process }
+39 --> 44
+5 --> 44
+6 --> 44
+45["Advanced Cut"]@{ shape: process }
+40 --> 45
+46["output\_merqury.spectra-cn.fl"]@{ shape: process }
+43 --> 46
+47["output\_merqury.spectra-asm.fl"]@{ shape: process }
+43 --> 47
+48["merqury\_qv"]@{ shape: process }
+43 --> 48
+49["output\_merqury.assembly\_01.spectra-cn.fl"]@{ shape: process }
+43 --> 49
+50["merqury\_stats"]@{ shape: process }
+43 --> 50
+51["output\_merqury.assembly\_02.spectra-cn.fl"]@{ shape: process }
+43 --> 51
+52["Replace"]@{ shape: process }
+45 --> 52
+
+```
diff --git a/...y-v2/Assembly-Hifi-Trio-phasing-VGP5/Assembly-Hifi-Trio-phasing-VGP5_diagram.md b/...y-v2/Assembly-Hifi-Trio-phasing-VGP5/Assembly-Hifi-Trio-phasing-VGP5_diagram.md
@@ -0,0 +1,120 @@
+```mermaid
+graph LR
+0["Pacbio Reads Collection : child"]@{ shape: docs }
+1["Paternal Illumina reads \(hap1\)"]@{ shape: docs }
+2["Maternal Illumina reads \(hap2\)"]@{ shape: docs }
+3["Meryl Database : Child"]@{ shape: doc }
+4["Hapmer Database : Paternal"]@{ shape: doc }
+5["Hapmer Database : Maternal"]@{ shape: doc }
+6["Bits for bloom filter"]@{ shape: lean-l }
+7["Database for Busco Lineage"]@{ shape: lean-l }
+8["Lineage"]@{ shape: lean-l }
+9["Homozygous Read Coverage"]@{ shape: lean-l }
+10["Genomescope Model Parameters"]@{ shape: doc }
+11["Genomescope Summary"]@{ shape: doc }
+12["Utilize homology information to correct trio-phasing errors"]@{ shape: lean-l }
+13["SAK input file \(Optional\)"]@{ shape: doc }
+14["Name for Haplotype 1"]@{ shape: lean-l }
+15["Name for Haplotype 2"]@{ shape: lean-l }
+16["Cutadapt"]@{ shape: process }
+0 --> 16
+17["Compute"]@{ shape: process }
+10 --> 17
+18["Search in textfiles"]@{ shape: process }
+11 --> 18
+19["MultiQC"]@{ shape: process }
+16 --> 19
+20["Cut"]@{ shape: process }
+17 --> 20
+21["Replace Text"]@{ shape: process }
+18 --> 21
+22["Parse parameter value"]@{ shape: process }
+20 --> 22
+23["Convert"]@{ shape: process }
+21 --> 23
+24["Homozygous read coverage for Hifiasm"]@{ shape: process }
+9 --> 24
+22 --> 24
+25["Cut"]@{ shape: process }
+23 --> 25
+26["Hifiasm"]@{ shape: process }
+24 --> 26
+6 --> 26
+16 --> 26
+12 --> 26
+1 --> 26
+2 --> 26
+27["Estimated genome size"]@{ shape: process }
+25 --> 27
+28["gfastats"]@{ shape: process }
+26 --> 28
+13 --> 28
+29["gfastats"]@{ shape: process }
+26 --> 29
+13 --> 29
+30["Raw Unitig Image"]@{ shape: process }
+26 --> 30
+31["gfastats"]@{ shape: process }
+26 --> 31
+13 --> 31
+32["gfastats"]@{ shape: process }
+26 --> 32
+13 --> 32
+33["gfastats"]@{ shape: process }
+26 --> 33
+34["gfastats"]@{ shape: process }
+26 --> 34
+35["gfastats"]@{ shape: process }
+26 --> 35
+27 --> 35
+36["gfastats"]@{ shape: process }
+26 --> 36
+27 --> 36
+37["Busco"]@{ shape: process }
+31 --> 37
+7 --> 37
+8 --> 37
+38["Busco"]@{ shape: process }
+32 --> 38
+7 --> 38
+8 --> 38
+39["Merqury"]@{ shape: process }
+31 --> 39
+32 --> 39
+3 --> 39
+5 --> 39
+4 --> 39
+40["Data prep Hap1"]@{ shape: subprocess }
+33 --> 40
+41["Data Prep Hap2"]@{ shape: subprocess }
+34 --> 41
+42["Text reformatting"]@{ shape: process }
+35 --> 42
+43["Text reformatting"]@{ shape: process }
+36 --> 43
+44["merqury\_qv"]@{ shape: process }
+39 --> 44
+45["output\_merqury.spectra-cn.fl"]@{ shape: process }
+39 --> 45
+46["output\_merqury.spectra-asm.fl"]@{ shape: process }
+39 --> 46
+47["output\_merqury.assembly\_01.spectra-cn.fl"]@{ shape: process }
+39 --> 47
+48["output\_merqury.assembly\_02.spectra-cn.fl"]@{ shape: process }
+39 --> 48
+49["merqury\_stats"]@{ shape: process }
+39 --> 49
+50["Plots"]@{ shape: subprocess }
+41 --> 50
+15 --> 50
+14 --> 50
+40 --> 50
+51["Join two Datasets"]@{ shape: process }
+43 --> 51
+42 --> 51
+52["Advanced Cut"]@{ shape: process }
+51 --> 52
+53["Replace"]@{ shape: process }
+52 --> 53
+
+```