galaxyproject · bgruening · May 29, 2024 · Apr 19, 2024 · Apr 19, 2024 · Apr 19, 2024
diff --git a/tools/cosg/.shed.yml b/tools/cosg/.shed.yml
@@ -0,0 +1,11 @@
+name: cosg
+owner: iuc
+type: unrestricted
+description: Marker gene identification for single-cell sequencing data using COSG.
+long_description: >
+  Accurate and fast cell marker gene identification with COSG. COSG is a cosine similarity-based method for more accurate and scalable marker gene identification.
+homepage_url: https://github.com/genecell/COSG
+remote_repository_url: https://github.com/galaxyproject/tools-iuc/tree/master/tools/cosg/
+categories:
+  - Transcriptomics
+  - Sequence Analysis
diff --git a/tools/cosg/cosg.xml b/tools/cosg/cosg.xml
@@ -0,0 +1,202 @@
+<tool id="cosg" name="COSG" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@profile@">
+    <description>Cell marker gene identification</description>
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <!-- Package requirements and version probe are defined once in macros.xml. -->
+    <expand macro="requirements"/>
+    <expand macro="version_command"/>
+    <!-- The shell command line is the @CMD@ token from macros.xml; it runs the generated script_file. -->
+    <command detect_errors="exit_code"><![CDATA[
+@CMD@
+      ]]></command>
+    <configfiles>
+        <!-- Python script rendered by Cheetah and executed by the command line.
+             It reads the AnnData input, runs cosg.cosg, writes the marker table
+             to marker.tsv and writes the updated AnnData back. -->
+        <configfile name="script_file"><![CDATA[
+@CMD_imports@
+@CMD_read_inputs@
+
+## Build the Python literal passed as `groups`: either the keyword 'all' or a
+## list of group names parsed from the comma-separated text field.
+## Galaxy's default text sanitizer maps quotes to __sq__ / __dq__, so map them
+## back before stripping the optional quotes around each name.
+#set $groups_text = str($method_options.groups).replace('__sq__', "'").replace('__dq__', '"')
+#set $group_names = [g.strip().strip("'" + '"').strip() for g in $groups_text.split(',')]
+#set $group_names = [g for g in $group_names if g]
+#if not $group_names or $group_names == ['all']
+    #set $groups_arg = "'all'"
+#else
+    ## repr() emits a correctly escaped Python list literal.
+    #set $groups_arg = repr($group_names)
+#end if
+
+cosg.cosg(adata,
+        groupby='$method_options.groupby',
+        groups=$groups_arg,
+        n_genes_user=$method_options.n_genes_user,
+        mu=$advanced_options.mu,
+        remove_lowly_expressed=$advanced_options.filter_expression.remove_lowly_expressed,
+        #if $advanced_options.filter_expression.remove_lowly_expressed == "True"
+        expressed_pct=$advanced_options.filter_expression.expressed_pct,
+        #end if
+        key_added='$advanced_options.key_added',
+        use_raw=$advanced_options.layer_selection.use_raw,
+        #if $advanced_options.layer_selection.use_raw == "False"
+        #if $advanced_options.layer_selection.layer
+        layer='$advanced_options.layer_selection.layer',
+        #end if
+        #end if
+        reference='$advanced_options.reference'
+        )
+
+## Read the results from the key the user chose, not a hard-coded 'cosg'.
+df=pd.DataFrame(adata.uns['$advanced_options.key_added']['names']).T
+df.to_csv('marker.tsv', sep='\t', index=True)
+
+@CMD_anndata_write_outputs@
+]]></configfile>
+    </configfiles>
+    <inputs>
+        <expand macro="inputs_anndata"/>
+        <!-- Core cosg.cosg arguments; required text fields reject empty values. -->
+        <section name="method_options" title="Method Options" expanded="true">
+            <param argument="groupby" type="text" value="" optional="false" label="The key of the cell groups in .obs">
+                <validator type="empty_field"/>
+            </param>
+            <param argument="groups" type="text" value="all" optional="false" label="Subset of cell groups" help="Use 'all' to include every group, or give a comma-separated list of group names, e.g. 'g1','g2','g3'.">
+                <validator type="empty_field"/>
+            </param>
+            <param argument="n_genes_user" type="integer" value="50" min="1" label="The number of genes that appear in the returned tables"/>
+        </section>
+        <section name="advanced_options" title="Advanced Options">
+            <!-- No upper bound: the only documented constraint is mu >= 0. -->
+            <param argument="mu" type="float" value="1.0" min="0.0" label="The penalty restricting marker genes expressing in non-target cell groups" help="Larger value represents more strict restrictions. mu should be >= 0, and by default, mu = 1."/>
+            <conditional name="filter_expression">
+                <param name="remove_lowly_expressed" type="select" label="Remove lowly expressed genes" help="If yes, genes expressed in a fraction of the target cells smaller than a specific value (`expressed_pct`) are not considered as marker genes for the target cells.">
+                    <option value="False">No</option>
+                    <option value="True">Yes</option>
+                </param>
+                <when value="False"/>
+                <when value="True">
+                    <param argument="expressed_pct" type="float" value="0.1" min="0.01" max="1.0" label="Minimum fraction of target cells expressing a gene" help="Genes expressed in a fraction of the target cells smaller than this value are not considered as marker genes for the target cells."/>
+                </when>
+            </conditional>
+            <param argument="key_added" type="text" value="cosg" optional="false" label="The key in adata.uns under which the results are saved">
+                <validator type="empty_field"/>
+            </param>
+            <conditional name="layer_selection">
+                <param name="use_raw" type="select" label="Use the raw attribute of adata, if present, to perform tests on" help="If set to Yes, adata.raw.X is used if it exists.">
+                    <option value="False">No</option>
+                    <option value="True">Yes</option>
+                </param>
+                <when value="False">
+                    <param argument="layer" type="text" value="" label="Key from adata.layers whose value will be used to perform tests on" help="If empty, adata.X is used. If a layer is specified, adata.layers[layer] is used."/>
+                </when>
+                <when value="True"/>
+            </conditional>
+            <param argument="reference" type="text" value="rest" optional="false" label="If a group identifier, compare with respect to this group." help="If you use the keyword 'rest', each group is compared to the union of the rest of the groups.">
+                <validator type="empty_field"/>
+            </param>
+        </section>
+        <expand macro="inputs_common_advanced"/>
+    </inputs>
+    <outputs>
+        <!-- AnnData result plus the optional log file come from macros.xml. -->
+        <expand macro="anndata_outputs"/>
+        <!-- Marker table that the script writes to marker.tsv in the working directory. -->
+        <data name="marker_out"
+              format="tabular"
+              label="${tool.name} on ${on_string}: Markers"
+              from_work_dir="marker.tsv"/>
+    </outputs>
+    <tests>
+        <!-- Each test enables the log output and checks the rendered cosg.cosg call,
+             the AnnData keys and the width of the marker table (index column + 50 genes).
+             Parameters are addressed through their enclosing section/conditional. -->
+        <test expect_num_outputs="3">
+            <!-- test 1: defaults, grouped by bulk_labels -->
+            <param name="adata" value="tl.rank_genes_groups.newton-cg.pbmc68k_highly_reduced.h5ad" />
+            <section name="method_options">
+                <param name="groupby" value="bulk_labels"/>
+            </section>
+            <section name="advanced_common">
+                <param name="show_log" value="true" />
+            </section>
+            <output name="hidden_output">
+                <assert_contents>
+                    <has_text_matching expression="cosg\.cosg"/>
+                    <has_text_matching expression="groupby='bulk_labels'"/>
+                    <has_text_matching expression="groups='all'"/>
+                    <has_text_matching expression="n_genes_user=50"/>
+                    <has_text_matching expression="mu=1\.0"/>
+                    <has_text_matching expression="remove_lowly_expressed=False"/>
+                    <has_text_matching expression="key_added='cosg'"/>
+                    <has_text_matching expression="use_raw=False"/>
+                    <has_text_matching expression="reference='rest'"/>
+                </assert_contents>
+            </output>
+            <output name="anndata_out" file="cosg.rank_genes_groups.newton-cg.pbmc68k_highly_reduced_1.h5ad" ftype="h5ad">
+                <assert_contents>
+                    <has_h5_keys keys="obs, var, uns" />
+                </assert_contents>
+            </output>
+            <output name="marker_out" file="marker_1.tsv" ftype="tabular" compare="sim_size">
+                <assert_contents>
+                    <has_n_columns n="51" />
+                </assert_contents>
+            </output>
+        </test>
+        <test expect_num_outputs="3">
+            <!-- test 2: lowly expressed genes removed, grouped by louvain -->
+            <param name="adata" value="tl.rank_genes_groups.newton-cg.pbmc68k_highly_reduced.h5ad" />
+            <section name="method_options">
+                <param name="groupby" value="louvain"/>
+            </section>
+            <section name="advanced_options">
+                <conditional name="filter_expression">
+                    <param name="remove_lowly_expressed" value="True" />
+                </conditional>
+            </section>
+            <section name="advanced_common">
+                <param name="show_log" value="true" />
+            </section>
+            <output name="hidden_output">
+                <assert_contents>
+                    <has_text_matching expression="cosg\.cosg"/>
+                    <has_text_matching expression="groupby='louvain'"/>
+                    <has_text_matching expression="groups='all'"/>
+                    <has_text_matching expression="n_genes_user=50"/>
+                    <has_text_matching expression="mu=1\.0"/>
+                    <has_text_matching expression="remove_lowly_expressed=True"/>
+                    <has_text_matching expression="expressed_pct=0\.1"/>
+                    <has_text_matching expression="key_added='cosg'"/>
+                    <has_text_matching expression="use_raw=False"/>
+                    <has_text_matching expression="reference='rest'"/>
+                </assert_contents>
+            </output>
+            <output name="anndata_out" file="cosg.rank_genes_groups.newton-cg.pbmc68k_highly_reduced_2.h5ad" ftype="h5ad">
+                <assert_contents>
+                    <has_h5_keys keys="obs, var, uns" />
+                </assert_contents>
+            </output>
+            <output name="marker_out" file="marker_2.tsv" ftype="tabular">
+                <assert_contents>
+                    <has_n_columns n="51" />
+                </assert_contents>
+            </output>
+        </test>
+        <test expect_num_outputs="3">
+            <!-- test 3: use adata.raw, grouped by bulk_labels -->
+            <param name="adata" value="tl.rank_genes_groups.newton-cg.pbmc68k_highly_reduced.h5ad" />
+            <section name="method_options">
+                <param name="groupby" value="bulk_labels"/>
+            </section>
+            <section name="advanced_options">
+                <conditional name="layer_selection">
+                    <param name="use_raw" value="True"/>
+                </conditional>
+            </section>
+            <section name="advanced_common">
+                <param name="show_log" value="true" />
+            </section>
+            <output name="hidden_output">
+                <assert_contents>
+                    <has_text_matching expression="cosg\.cosg"/>
+                    <has_text_matching expression="groupby='bulk_labels'"/>
+                    <has_text_matching expression="groups='all'"/>
+                    <has_text_matching expression="n_genes_user=50"/>
+                    <has_text_matching expression="mu=1\.0"/>
+                    <has_text_matching expression="remove_lowly_expressed=False"/>
+                    <has_text_matching expression="key_added='cosg'"/>
+                    <has_text_matching expression="use_raw=True"/>
+                    <has_text_matching expression="reference='rest'"/>
+                </assert_contents>
+            </output>
+            <output name="anndata_out" file="cosg.rank_genes_groups.newton-cg.pbmc68k_highly_reduced_3.h5ad" ftype="h5ad">
+                <assert_contents>
+                    <has_h5_keys keys="obs, var, uns" />
+                </assert_contents>
+            </output>
+            <output name="marker_out" file="marker_3.tsv" ftype="tabular">
+                <assert_contents>
+                    <has_n_columns n="51" />
+                </assert_contents>
+            </output>
+        </test>
+    </tests>
+    <help><![CDATA[
+Marker gene identification for single-cell sequencing data using COSG.
+============================================================================================================
+
+Accurate and fast cell marker gene identification with COSG
+
+COSG is a cosine similarity-based method for more accurate and scalable marker gene identification.
+
+- COSG is a general method for cell marker gene identification across different data modalities, e.g., scRNA-seq, scATAC-seq and spatially resolved transcriptome data. 
+- Marker genes or genomic regions identified by COSG are more indicative and with greater cell-type specificity.
+- COSG is ultrafast for large-scale datasets, and is capable of identifying marker genes for one million cells in less than two minutes.
+
+This tool wraps the Python version of COSG, which is hosted at https://github.com/genecell/COSG; an R implementation of COSG is also available.
+
+    ]]></help>
+    <expand macro="citations"/>
+</tool>
diff --git a/tools/cosg/macros.xml b/tools/cosg/macros.xml
@@ -0,0 +1,85 @@
+<macros>
+    <!-- Version tokens. @TOOL_VERSION@ also pins the cosg package version in the
+         requirements macro below. -->
+    <token name="@TOOL_VERSION@">1.0.1</token>
+    <token name="@VERSION_SUFFIX@">0</token>
+    <token name="@profile@">22.05</token>
+    <!-- Pinned package requirements; the yield element lets an expanding tool
+         append further requirement entries. -->
+    <xml name="requirements">
+        <requirements>
+            <requirement type="package" version="@TOOL_VERSION@">cosg</requirement>
+            <requirement type="package" version="1.9.8">scanpy</requirement>
+            <requirement type="package" version="1.5.3">pandas</requirement>
+            <requirement type="package" version="3.7">matplotlib</requirement>
+            <requirement type="package" version="0.12.2">seaborn</requirement>
+            <yield />
+        </requirements>
+    </xml>
+    <!-- Creator metadata for the wrapper. -->
+    <xml name="creators">
+        <creator>
+            <organization name="European Galaxy Team" url="https://galaxyproject.org/eu/" />
+        </creator>
+    </xml>
+    <!-- DOIs cited by the tool. -->
+    <xml name="citations">
+        <citations>
+            <citation type="doi">10.1093/bib/bbab579</citation>
+            <citation type="doi">10.1093/gigascience/giaa102</citation>
+        </citations>
+    </xml>
+    <!-- Prints the installed cosg distribution version as reported by importlib.metadata. -->
+    <xml name="version_command">
+        <version_command><![CDATA[python -c "import cosg; from importlib.metadata import version; print(version('cosg'))"]]></version_command>
+    </xml>
+    <!-- Shell command line, in order:
+         1. copy the input dataset to anndata.h5ad in the working directory;
+         2. write the rendered script into the log output;
+         3. run the script, appending its stdout to the log output;
+         4. append a listing of the working directory to the log output;
+         5. touch anndata_info.txt so the final cat cannot fail on a missing file;
+         6. print anndata_info.txt through the @CMD_prettify_stdout@ sed chain. -->
+    <token name="@CMD@"><![CDATA[
+cp '$adata' 'anndata.h5ad' &&
+cat '$script_file' > '$hidden_output' &&
+python '$script_file' >> '$hidden_output' &&
+ls . >> '$hidden_output' &&
+touch 'anndata_info.txt' &&
+cat 'anndata_info.txt' @CMD_prettify_stdout@
+    ]]>
+    </token>
+    <!-- Python imports placed at the top of the generated script. -->
+    <token name="@CMD_imports@"><![CDATA[
+import scanpy as sc
+import pandas as pd
+import numpy as np
+import cosg
+    ]]>
+    </token>
+    <!-- Sanitizer that starts from the @VALIDINITIAL@ character set (default
+         string.printable, set by the token_validinitial attribute) and removes
+         the single quote. -->
+    <xml name="sanitize_query" token_validinitial="string.printable">
+        <sanitizer>
+            <valid initial="@VALIDINITIAL@">
+                <remove value="&apos;" />
+            </valid>
+       </sanitizer>
+    </xml>
+    <!-- Sanitizer that starts from the @VALIDINITIAL@ character set (default
+         string.digits) and additionally allows the comma. -->
+    <xml name="sanitize_vectors" token_validinitial="string.digits">
+        <sanitizer>
+            <valid initial="@VALIDINITIAL@">
+                <add value=","/>
+            </valid>
+        </sanitizer>
+    </xml>
+    <!-- The h5ad input dataset; the command line copies it to anndata.h5ad. -->
+    <xml name="inputs_anndata">
+        <param name="adata" type="data" format="h5ad" label="Annotated data matrix"/>
+    </xml>
+    <!-- Script snippet that loads the copied input into the variable adata. -->
+    <token name="@CMD_read_inputs@"><![CDATA[
+adata = sc.read_h5ad('anndata.h5ad')
+]]>
+    </token>
+    <!-- Section holding the show_log switch that the hidden_output filter reads. -->
+    <xml name="inputs_common_advanced">
+        <section name="advanced_common" title="Advanced Output Options" expanded="false">
+            <param name="show_log" type="boolean" checked="false" label="Output Log?" />
+        </section>
+    </xml>
+    <!-- Outputs shared by the tool: the AnnData file picked up from anndata.h5ad in
+         the working directory, and a log file that is only kept when show_log is set. -->
+    <xml name="anndata_outputs">
+        <data name="anndata_out" format="h5ad" from_work_dir="anndata.h5ad" label="${tool.name} on ${on_string}: Annotated data matrix"/>
+        <!-- Label follows the same "tool on inputs: what" pattern as the other outputs
+             so the log can be told apart in the history. -->
+        <data name="hidden_output" format="txt" label="${tool.name} on ${on_string}: Log file">
+            <filter>advanced_common['show_log']</filter>
+        </data>
+    </xml>
+    <!-- Script snippet that writes adata back to anndata.h5ad and saves its printed
+         summary to anndata_info.txt. -->
+    <token name="@CMD_anndata_write_outputs@"><![CDATA[
+adata.write_h5ad('anndata.h5ad')
+with open('anndata_info.txt','w', encoding='utf-8') as ainfo:
+    print(adata, file=ainfo)
+]]>
+    </token>
+    <!-- sed chain appended after "cat anndata_info.txt":
+         1. on line 1 only, rewrite "AnnData object with X = Y" as "X: Y";
+         2. delete every single quote;
+         3. turn each "key: value" line into "[key]" followed by a new line "-    value";
+         4. replace every ", " with a line break followed by "-    ". -->
+    <token name="@CMD_prettify_stdout@"><![CDATA[ | sed -r '1 s|AnnData object with (.+) = (.*)\s*|\1: \2|g' | sed "s|'||g"  | sed -r 's|^\s*(.*):\s(.*)|[\1]\n-    \2|g' | sed 's|, |\n-    |g'
+    ]]></token>
+</macros>
diff --git a/tools/cosg/test-data/cosg.rank_genes_groups.newton-cg.pbmc68k_highly_reduced_1.h5ad b/tools/cosg/test-data/cosg.rank_genes_groups.newton-cg.pbmc68k_highly_reduced_1.h5ad
diff --git a/tools/cosg/test-data/cosg.rank_genes_groups.newton-cg.pbmc68k_highly_reduced_2.h5ad b/tools/cosg/test-data/cosg.rank_genes_groups.newton-cg.pbmc68k_highly_reduced_2.h5ad
diff --git a/tools/cosg/test-data/cosg.rank_genes_groups.newton-cg.pbmc68k_highly_reduced_3.h5ad b/tools/cosg/test-data/cosg.rank_genes_groups.newton-cg.pbmc68k_highly_reduced_3.h5ad
diff --git a/tools/cosg/test-data/marker_1.tsv b/tools/cosg/test-data/marker_1.tsv
@@ -0,0 +1,11 @@
+	0	1	2	3	4	5	6	7	8	9	10	11	12	13	14	15	16	17	18	19	20	21	22	23	24	25	26	27	28	29	30	31	32	33	34	35	36	37	38	39	40	41	42	43	44	45	46	47	48	49
+CD14+ Monocyte	PILRA	PSAP	CD68	TMEM176B	FTL	NPC2	LST1	FCGR3A	FCER1G	CEBPB	FCN1	SERPINA1	OAZ1	CFD	FTH1	HCK	AIF1	SAT1	CTSS	S100A11	MS4A7	TYROBP	COTL1	STXBP2	RP11-290F20.3	S100A4	IFITM2	SPI1	DUSP1	SESN2	IFITM3	MPP1	GALE	CORO1B	RP11-390E23.6	VIMP	RSBN1L-AS1	CHD4	CFP	GSTP1	PFN1	FCGRT	ADTRP	ARHGDIB	AMICA1	HLA-DRB5	CST3	GRN	HLA-DPA1	SSR3
+CD19+ B	TNFRSF13B	CD79B	SMARCB1	PNOC	CCDC50	AL928768.3	BANK1	MS4A1	CD79A	ISG20	IGLL5	TNFRSF17	KIAA0125	TPD52	PEBP1	FKBP11	CCDC132	SUB1	POU2AF1	MZB1	PTPRCAP	UBE2J1	BLK	SPIB	DERL3	FAM63B	MPHOSPH9	IGJ	FCRLA	XBP1	NCF1	SSR3	CD52	TSHZ2	PDLIM1	VIMP	SSR4	S1PR4	SELL	HMGA1	NUCB2	JUN	CD27	ARHGDIB	GYPC	CALR	ADTRP	BTG1	EXOG	RARRES3
+CD34+	PRSS57	C19orf77	SPINK2	RP11-620J15.3	SNHG7	CYTL1	EGFL7	NGFRAP1	SOX4	NFE2	EGR1	RP3-467N11.1	H1FX	CDK6	SERPINB1	SPINT2	HMGA1	IL1B	NUCB2	RPLP0	IGFBP7	RPLP1	ATXN7L3B	RPS3	C1orf228	KIAA0125	RPL3	SYPL1	CD63	LDHB	SEPT1	JUN	FAM101B	PRKCQ-AS1	MATK	PEBP1	SELL	ITM2A	SSR3	SPON2	XBP1	UBE2J1	VIMP	GYPC	STK17A	STMN1	VIM	MZB1	HOPX	CD99
+CD4+/CD25 T Reg	IL32	SPOCK2	ACTG1	CD2	CD3D	GPR171	ARHGDIB	ACOX1	MAL	SIT1	GIMAP4	AES	CD52	SEPT1	TMSB10	LAT	STMN1	LINC00402	CD27	TSHZ2	S1PR4	CD3E	PFN1	CD99	AQP3	PTPRCAP	CD3G	LY9	LCK	CD247	S100A4	CCR7	TTC39C	CORO1B	MPHOSPH9	FYB	RPSA	FLT3LG	B2M	GIMAP7	PRKCQ-AS1	SELL	BTG1	CCDC132	GYPC	DENND2D	LDHB	IL7R	ITM2A	RPLP0
+CD4+/CD45RA+/CD25- Naive T	EAF2	GNG7	SSR4	CALR	DERL3	MANF	IGJ	XBP1	ATXN7L3B	SSR3	UBE2J1	CD79A	MZB1	RP3-467N11.1	TNFRSF17	NCF1	CDK6	SUB1	POU2AF1	AL928768.3	FKBP11	VIMP	GYPC	JUN	CD27	PEBP1	SMARCB1	FLT3LG	RPLP1	RPLP0	CCDC50	ISG20	IGLL5	HCST	GSTP1	GPX1	CD52	VIM	PTPRCAP	FCGRT	CD74	B2M	RPL3	CYTL1	SPINK2	PRSS57	C19orf77	RP11-620J15.3	FAM101B	CCDC132
+CD4+/CD45RO+ Memory	RNF138	NOSIP	IFITM1	LCK	RARRES3	ALOX5AP	FAM63B	RAB3IP	GZMK	CD3G	SEPT1	LDHB	SELL	CD3D	EXOG	RPSA	CD247	AES	CD52	TMSB10	NUCB2	DENND2D	RPL3	RPLP1	ACTG1	FYB	GIMAP7	CORO1B	LY9	CD7	PFN1	RPS3	GYPC	CD2	ARHGDIB	IL32	RPLP0	CD99	CD3E	GIMAP4	HCST	B2M	LAT	ISG20	ITM2A	FKBP11	SERPINB1	STK17A	CCR7	PTPRCAP
+CD56+ NK	CST7	SPON2	HOPX	GNLY	NKG7	CTSW	KLRC2	CD7	MATK	PCIF1	CLIC3	FGFBP2	SYPL1	GZMB	C9orf142	PRF1	CD247	HCST	GZMA	GZMH	STMN1	ALOX5AP	CD63	CD99	IGFBP7	GZMM	CCL5	B2M	DENND2D	GIMAP7	RARRES3	SIT1	IFITM1	PFN1	EXOG	XBP1	IFITM2	GIMAP4	VIMP	STK17A	LCK	GZMK	SEPT1	SSR3	CD8A	CD3G	SPOCK2	RPS3	LDHB	IL32
+CD8+ Cytotoxic T	FAM101B	ADTRP	GZMK	HCST	LAT	EGR1	CD8B	CCL5	RPL3	LINC00402	FGFBP2	GZMM	RPS3	CD3E	GYPC	DENND2D	C9orf142	GZMA	SEPT1	JUN	FYB	CD8A	SELL	ALOX5AP	CD3G	STK17A	AQP3	C1orf228	CD3D	HOPX	NKG7	CD2	NGFRAP1	RPLP1	RPSA	CCR7	IL7R	SPON2	PRF1	RARRES3	PRKCQ-AS1	FKBP11	MANF	CTSW	GNLY	CD27	LDHB	MAL	LTB	RPLP0
+CD8+/CD45RA+ Naive Cytotoxic	RP11-291B21.2	CD8A	CD8B	RSBN1L-AS1	GIMAP5	GZMM	GALE	CCR7	STK17A	RAB3IP	GZMH	GIMAP7	CD3E	C1orf228	LCK	CCL5	PEBP1	CD27	GYPC	LDHB	RNF34	CD99	CD3G	PFN1	IL7R	CD2	C9orf142	TMSB10	NGFRAP1	S1PR4	ITM2A	CD7	RPS3	IL32	FYB	IFITM1	CD52	LAT	GIMAP4	MAL	STMN1	NOSIP	RARRES3	SPOCK2	ACTG1	PRF1	CD3D	RPLP1	SELL	GZMA
+Dendritic	HLA-DQB1	CST3	HLA-DRB1	HLA-DQA2	HLA-DQA1	LYZ	HLA-DPB1	HLA-DPA1	HLA-DMA	HLA-DRA	VIM	CD74	ALDH2	FCER1A	GPX1	HLA-DRB5	LGALS2	MNDA	FCGRT	GRN	HLA-DMB	FOS	CPVL	CLEC10A	AMICA1	CFP	LY86	GSTP1	RP11-473M20.7	IL1B	GSN	SPINT2	CCDC163P	IGFBP7	EXOG	DUSP1	CD63	COTL1	FTH1	SPI1	TYROBP	SPIB	S100A11	OAZ1	CTSS	CCDC50	AIF1	SERPINB1	TMSB10	PCIF1