From c26a7d0bd9d857f4d1bacebe094c33539783248d Mon Sep 17 00:00:00 2001
From: Hendrik Cannoodt
Date: Tue, 30 Jan 2024 16:32:29 +0100
Subject: [PATCH] Add bgzip (#13)

* Quick conversion from snakemake wrapper

* reorder the fields a bit and expand the info field

* improve implementation of regular script and test script

* Add extra arguments

* Fix late issues/remarks after PR was already merged

* update PR#

* Update src/bgzip/config.vsh.yaml

Co-authored-by: Robrecht Cannoodt

---------

Co-authored-by: Robrecht Cannoodt
---
 CHANGELOG.md                  |   2 +
 src/bgzip/config.vsh.yaml     | 128 ++++++++++++++++++++++++++++++++++
 src/bgzip/help.txt            |  22 ++++++
 src/bgzip/test_data/script.sh |  10 +++
 src/bgzip/test_data/test.vcf  |  23 +++++++
 5 files changed, 185 insertions(+)
 create mode 100644 src/bgzip/config.vsh.yaml
 create mode 100644 src/bgzip/help.txt
 create mode 100644 src/bgzip/test_data/script.sh
 create mode 100644 src/bgzip/test_data/test.vcf

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 63f69365..d7262a74 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,6 +8,8 @@
 
 * `fastp`: An ultra-fast all-in-one FASTQ preprocessor (PR #3).
 
+* `bgzip`: Add bgzip functionality to compress and decompress files (PR #13).
+
 ## MAJOR CHANGES
 
 ## MINOR CHANGES
diff --git a/src/bgzip/config.vsh.yaml b/src/bgzip/config.vsh.yaml
new file mode 100644
index 00000000..159c33e8
--- /dev/null
+++ b/src/bgzip/config.vsh.yaml
@@ -0,0 +1,128 @@
+functionality:
+  name: bgzip
+  description: Block compression/decompression utility
+  info:
+    homepage: https://www.htslib.org/
+    documentation: https://www.htslib.org/doc/bgzip.html
+    repository: https://github.com/samtools/htslib
+    licence: MIT
+  requirements:
+    commands: [ bgzip ]
+  argument_groups:
+    - name: Inputs
+      arguments:
+        - name: --input
+          type: file
+          direction: input
+          description: file to be compressed or decompressed
+          required: true
+    - name: Outputs
+      arguments:
+        - name: --output
+          type: file
+          direction: output
+          description: compressed or decompressed output
+          required: true
+        - name: --index_name
+          alternatives: -I
+          type: file
+          direction: output
+          description: name of BGZF index file [file.gz.gzi]
+    - name: Arguments
+      arguments:
+        - name: --offset
+          alternatives: -b
+          type: integer
+          description: decompress at virtual file pointer (0-based uncompressed offset)
+        - name: --decompress
+          alternatives: -d
+          type: boolean_true
+          description: decompress the input file
+        - name: --rebgzip
+          alternatives: -g
+          type: boolean_true
+          description: use an index file to bgzip a file
+        - name: --index
+          alternatives: -i
+          type: boolean_true
+          description: compress and create BGZF index
+        - name: --compress_level
+          alternatives: -l
+          type: integer
+          description: compression level to use when compressing; 0 to 9, or -1 for default [-1]
+          min: -1
+          max: 9
+        - name: --reindex
+          alternatives: -r
+          type: boolean_true
+          description: (re)index the output file
+        - name: --size
+          alternatives: -s
+          type: integer
+          description: decompress INT bytes (uncompressed size)
+          min: 0
+        - name: --test
+          alternatives: -t
+          type: boolean_true
+          description: test integrity of compressed file
+        - name: --binary
+          type: boolean_true
+          description: Don't align blocks with text lines
+  resources:
+    - type: bash_script
+      text: |
+        # Flags whose value is the string "false" are unset so that the
+        # ${par_x:+flag} expansions below omit the option entirely.
+        [[ "$par_decompress" == "false" ]] && unset par_decompress
+        [[ "$par_rebgzip" == "false" ]] && unset par_rebgzip
+        [[ "$par_index" == "false" ]] && unset par_index
+        [[ "$par_reindex" == "false" ]] && unset par_reindex
+        [[ "$par_test" == "false" ]] && unset par_test
+        [[ "$par_binary" == "false" ]] && unset par_binary
+        # -c writes to standard output, which is redirected to the output file.
+        bgzip -c \
+          ${meta_cpus:+--threads "${meta_cpus}"} \
+          ${par_offset:+-b "${par_offset}"} \
+          ${par_decompress:+-d} \
+          ${par_rebgzip:+-g} \
+          ${par_index:+-i} \
+          ${par_index_name:+-I "${par_index_name}"} \
+          ${par_compress_level:+-l "${par_compress_level}"} \
+          ${par_reindex:+-r} \
+          ${par_size:+-s "${par_size}"} \
+          ${par_test:+-t} \
+          ${par_binary:+--binary} \
+          "$par_input" > "$par_output"
+  test_resources:
+    - type: bash_script
+      text: |
+        set -e
+
+        "$meta_executable" --input "$meta_resources_dir/test_data/test.vcf" --output "test.vcf.gz"
+
+        echo ">> Checking output of compressing"
+        [ ! -f "test.vcf.gz" ] && echo "Output file test.vcf.gz does not exist" && exit 1
+
+        "$meta_executable" --input "test.vcf.gz" --output "test.vcf" --decompress
+
+        echo ">> Checking output of decompressing"
+        [ ! -f "test.vcf" ] && echo "Output file test.vcf does not exist" && exit 1
+
+        echo ">> Checking original and decompressed files are the same"
+        set +e
+        cmp --silent -- "$meta_resources_dir/test_data/test.vcf" "test.vcf"
+        [ $? -ne 0 ] && echo "files are different" && exit 1
+        set -e
+
+        echo "> Test successful"
+    - type: file
+      path: test_data
+
+platforms:
+  - type: docker
+    image: quay.io/biocontainers/htslib:1.19--h81da01d_0
+    setup:
+      - type: docker
+        run: |
+          bgzip -h | grep 'Version:' 2>&1 | sed 's/Version:\s\(.*\)/bgzip: "\1"/' > /var/software_versions.txt
+  - type: nextflow
\ No newline at end of file
diff --git a/src/bgzip/help.txt b/src/bgzip/help.txt
new file mode 100644
index 00000000..d4012efd
--- /dev/null
+++ b/src/bgzip/help.txt
@@ -0,0 +1,22 @@
+```bash
+bgzip -h
+```
+
+Version: 1.19
+Usage:   bgzip [OPTIONS] [FILE] ...
+Options:
+   -b, --offset INT           decompress at virtual file pointer (0-based uncompressed offset)
+   -c, --stdout               write on standard output, keep original files unchanged
+   -d, --decompress           decompress
+   -f, --force                overwrite files without asking
+   -g, --rebgzip              use an index file to bgzip a file
+   -h, --help                 give this help
+   -i, --index                compress and create BGZF index
+   -I, --index-name FILE      name of BGZF index file [file.gz.gzi]
+   -k, --keep                 don't delete input files during operation
+   -l, --compress-level INT   Compression level to use when compressing; 0 to 9, or -1 for default [-1]
+   -r, --reindex              (re)index compressed file
+   -s, --size INT             decompress INT bytes (uncompressed size)
+   -t, --test                 test integrity of compressed file
+       --binary               Don't align blocks with text lines
+   -@, --threads INT          number of compression threads to use [1]
diff --git a/src/bgzip/test_data/script.sh b/src/bgzip/test_data/script.sh
new file mode 100644
index 00000000..c9114473
--- /dev/null
+++ b/src/bgzip/test_data/script.sh
@@ -0,0 +1,10 @@
+# bgzip test data
+
+# Test data was obtained from https://github.com/snakemake/snakemake-wrappers/tree/master/bio/bgzip/test.
+
+if [ ! -d /tmp/snakemake-wrappers ]; then
+  git clone --depth 1 --single-branch --branch master https://github.com/snakemake/snakemake-wrappers /tmp/snakemake-wrappers
+fi
+
+cp -r /tmp/snakemake-wrappers/bio/bgzip/test/* src/bgzip/test_data
+
diff --git a/src/bgzip/test_data/test.vcf b/src/bgzip/test_data/test.vcf
new file mode 100644
index 00000000..11b5400e
--- /dev/null
+++ b/src/bgzip/test_data/test.vcf
@@ -0,0 +1,23 @@
+##fileformat=VCFv4.0
+##fileDate=20090805
+##source=https://www.internationalgenome.org/wiki/Analysis/vcf4.0/
+##reference=1000GenomesPilot-NCBI36
+##phasing=partial
+##INFO=
+##INFO=
+##INFO=
+##INFO=
+##INFO=
+##INFO=
+##FILTER=
+##FILTER=
+##FORMAT=
+##FORMAT=
+##FORMAT=
+##FORMAT=
+#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003
+20 14370 rs6054257 G A 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 1/1:43:5:.,.
+20 17330 . T A 3 q10 NS=3;DP=11;AF=0.017 GT:GQ:DP:HQ 0|0:49:3:58,50 0|1:3:5:65,3 0/0:41:3
+20 1110696 rs6040355 A G,T 67 PASS NS=2;DP=10;AF=0.333,0.667;AA=T;DB GT:GQ:DP:HQ 1|2:21:6:23,27 2|1:2:0:18,2 2/2:35:4
+20 1230237 . T . 47 PASS NS=3;DP=13;AA=T GT:GQ:DP:HQ 0|0:54:7:56,60 0|0:48:4:51,51 0/0:61:2
+20 1234567 microsat1 GTCT G,GTACT 50 PASS NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/2:17:2 1/1:40:3