diff --git a/.gitignore b/.gitignore
index 01de547a..c6161e94 100644
--- a/.gitignore
+++ b/.gitignore
@@ -18,4 +18,3 @@ out
 scripts/helper_scripts/parser/output
 scripts/helper_scripts/parser/src/META-INF
 .idea/
-
diff --git a/scripts/build_binaries.sh b/scripts/build_binaries.sh
new file mode 100755
index 00000000..adfb5c93
--- /dev/null
+++ b/scripts/build_binaries.sh
@@ -0,0 +1,19 @@
+#! /usr/bin/env bash
+
+# All references to an external script should be relative to the location of this script.
+# See: http://mywiki.wooledge.org/BashFAQ/028
+CURRENT_LOCATION="${BASH_SOURCE%/*}"
+
+checkdep() {
+    which $1 > /dev/null 2>&1 || hash $1 > /dev/null 2>&1 || {
+        echo "Unipept database builder requires ${2:-$1} to be installed." >&2
+        exit 1
+    }
+}
+
+checkdep cargo "Rust toolchain"
+
+# Build binaries and copy them to the /helper_scripts folder
+cd $CURRENT_LOCATION/helper_scripts/unipept-database-rs
+cargo build --release
+find ./target/release -maxdepth 1 -type f -executable -exec cp {} .. \;
diff --git a/scripts/build_database.sh b/scripts/build_database.sh
index a44d13a0..08f40152 100755
--- a/scripts/build_database.sh
+++ b/scripts/build_database.sh
@@ -36,7 +36,7 @@ Required parameters:
     - swissprot: https://ftp.expasy.org/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.xml.gz
     - trembl: https://ftp.expasy.org/databases/uniprot/current_release/knowledgebase/complete/uniprot_trembl.xml.gz
 
-  * OUTPUT_DIR: Directory in which the tsv.gz-files that are produced by this script will be stored.
+  * OUTPUT_DIR: Directory in which the tsv.lz4-files that are produced by this script will be stored.
 
 Options:
   * -h
@@ -71,14 +71,12 @@ Dependencies:
 
 This script requires some non-standard dependencies to be installed before it can be used.
 This is a list of these items (which can normally be installed through your package manager):
 
-  * maven
-  * node-js
   * curl
   * pv
   * pigz
-  * java
   * uuidgen
   * parallel
+  * lz4
 END
 }
@@ -245,12 +243,10 @@ checkDirectoryAndCreate "$4"
 
 ### Check that all dependencies required for this script to function are met.
 checkdep curl
-checkdep java
-checkdep mvn "Maven"
 checkdep uuidgen
 checkdep pv
-checkdep node
 checkdep pigz
+checkdep lz4
 
 ### Default configuration for this script
 PEPTIDE_MIN_LENGTH=5 # What is the minimum length (inclusive) for tryptic peptides?"
@@ -258,9 +254,11 @@ PEPTIDE_MAX_LENGTH=50 # What is the maximum length (inclusive) for tryptic pepti
 TABDIR="$OUTPUT_DIR" # Where should I store the final TSV files (large, single-write)?
 INTDIR="$TEMP_DIR/$UNIPEPT_TEMP_CONSTANT" # Where should I store intermediate TSV files (large, single-write, multiple-read?
 KMER_LENGTH=9 # What is the length (k) of the K-mer peptides?
-JAVA_MEM="2g" # How much memory should Java use?
 CMD_SORT="sort --buffer-size=$SORT_MEMORY --parallel=4" # Which sort command should I use?
-CMD_GZIP="gzip -" # Which pipe compression command should I use?
+CMD_GZIP="pigz -" # Which pipe compression command should I use for .gz files?
+CMD_ZCAT="pigz -dc" # Which decompression command should I use for .gz files?
+CMD_LZ4="lz4 -c" # Which pipe compression command should I use for .lz4 files?
+CMD_LZ4CAT="lz4 -dc" # Which decompression command should I use for .lz4 files?
 ENTREZ_BATCH_SIZE=1000 # Which batch size should I use for communication with Entrez?
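A minimal sketch (not from the patch itself) of how the CMD_GZIP/CMD_ZCAT/CMD_LZ4/CMD_LZ4CAT variables defined above are meant to compose; the file names are illustrative and the only assumption is that pigz and lz4 are on the PATH:

# Sketch only: decompress a .gz input, transform it, and re-compress the result as .lz4.
CMD_ZCAT="pigz -dc"   # decompress .gz to stdout
CMD_LZ4="lz4 -c"      # compress stdin to .lz4 on stdout
CMD_LZ4CAT="lz4 -dc"  # decompress .lz4 to stdout

$CMD_ZCAT input.tsv.gz | cut -f1,2 | $CMD_LZ4 - > output.tsv.lz4
$CMD_LZ4CAT output.tsv.lz4 | head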
TAXON_FALLBACK_URL="https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdmp.zip" @@ -301,7 +299,22 @@ guz() { fifo="$(uuidgen)-$(basename "$1")" mkfifo "$TEMP_DIR/$UNIPEPT_TEMP_CONSTANT/$fifo" echo "$TEMP_DIR/$UNIPEPT_TEMP_CONSTANT/$fifo" - { zcat "$1" > "$TEMP_DIR/$UNIPEPT_TEMP_CONSTANT/$fifo" && rm "$TEMP_DIR/$UNIPEPT_TEMP_CONSTANT/$fifo" || kill "$self"; } > /dev/null & + { $CMD_ZCAT "$1" > "$TEMP_DIR/$UNIPEPT_TEMP_CONSTANT/$fifo" && rm "$TEMP_DIR/$UNIPEPT_TEMP_CONSTANT/$fifo" || kill "$self"; } > /dev/null & +} + +lz() { + fifo="$(uuidgen)-$(basename "$1")" + mkfifo "$TEMP_DIR/$UNIPEPT_TEMP_CONSTANT/$fifo" + echo "$TEMP_DIR/$UNIPEPT_TEMP_CONSTANT/$fifo" + mkdir -p "$(dirname "$1")" + { $CMD_LZ4 - < "$TEMP_DIR/$UNIPEPT_TEMP_CONSTANT/$fifo" > "$1" && rm "$TEMP_DIR/$UNIPEPT_TEMP_CONSTANT/$fifo" || kill "$self"; } > /dev/null & +} + +luz() { + fifo="$(uuidgen)-$(basename "$1")" + mkfifo "$TEMP_DIR/$UNIPEPT_TEMP_CONSTANT/$fifo" + echo "$TEMP_DIR/$UNIPEPT_TEMP_CONSTANT/$fifo" + { $CMD_LZ4CAT "$1" > "$TEMP_DIR/$UNIPEPT_TEMP_CONSTANT/$fifo" && rm "$TEMP_DIR/$UNIPEPT_TEMP_CONSTANT/$fifo" || kill "$self"; } > /dev/null & } have() { @@ -350,10 +363,10 @@ create_taxon_tables() { -e 's/parvorder/no rank/' "$TEMP_DIR/$UNIPEPT_TEMP_CONSTANT/nodes.dmp" mkdir -p "$OUTPUT_DIR" - java -Xms"$JAVA_MEM" -Xmx"$JAVA_MEM" -jar "$CURRENT_LOCATION/helper_scripts/NamesNodes2TaxonsLineages.jar" \ + $CURRENT_LOCATION/helper_scripts/taxons-lineages \ --names "$TEMP_DIR/$UNIPEPT_TEMP_CONSTANT/names.dmp" --nodes "$TEMP_DIR/$UNIPEPT_TEMP_CONSTANT/nodes.dmp" \ - --taxons "$(gz "$OUTPUT_DIR/taxons.tsv.gz")" \ - --lineages "$(gz "$OUTPUT_DIR/lineages.tsv.gz")" + --taxons "$(lz "$OUTPUT_DIR/taxons.tsv.lz4")" \ + --lineages "$(lz "$OUTPUT_DIR/lineages.tsv.lz4")" rm "$TEMP_DIR/$UNIPEPT_TEMP_CONSTANT/names.dmp" "$TEMP_DIR/$UNIPEPT_TEMP_CONSTANT/nodes.dmp" log "Finished creating the taxon tables." @@ -368,7 +381,8 @@ download_and_convert_all_sources() { DB_TYPES_ARRAY=($DB_TYPES) DB_SOURCES_ARRAY=($DB_SOURCES) - IFS="$OLDIFS" + # Set IFS to newline to properly split the $CHUNKS variable for folders with newlines + IFS=$'\n' while [[ "$IDX" -ne "${#DB_TYPES_ARRAY}" ]] && [[ -n $(echo "${DB_TYPES_ARRAY[$IDX]}" | sed "s/\s//g") ]] do @@ -396,7 +410,7 @@ download_and_convert_all_sources() { reportProgress -1 "Downloading database index for $DB_TYPE." 3 - curl --continue-at - --create-dirs "$DB_SOURCE" --silent | zcat | java -jar "$CURRENT_LOCATION/helper_scripts/XmlToTabConverter.jar" 5 50 "$DB_TYPE" "$VERBOSE" | node "$CURRENT_LOCATION/helper_scripts/WriteToChunk.js" "$DB_INDEX_OUTPUT" "$VERBOSE" + curl --continue-at - --create-dirs "$DB_SOURCE" --silent | pv -i 5 -n -s "$SIZE" 2> >(reportProgress - "Downloading database index for $DB_TYPE." 3 >&2) | $CMD_ZCAT | $CURRENT_LOCATION/helper_scripts/xml-parser -t "$DB_TYPE" | $CURRENT_LOCATION/helper_scripts/write-to-chunk --output-dir "$DB_INDEX_OUTPUT" # Now, compress the different chunks CHUNKS=$(find "$DB_INDEX_OUTPUT" -name "*.chunk") @@ -407,7 +421,7 @@ download_and_convert_all_sources() { for CHUNK in $CHUNKS do echo "Compressing $CHUNK_IDX of $TOTAL_CHUNKS for $DB_TYPE" - pv -i 5 -n "$CHUNK" 2> >(reportProgress - "Processing chunk $CHUNK_IDX of $TOTAL_CHUNKS for $DB_TYPE index." 4 >&2) | pigz > "$CHUNK.gz" + pv -i 5 -n "$CHUNK" 2> >(reportProgress - "Processing chunk $CHUNK_IDX of $TOTAL_CHUNKS for $DB_TYPE index." 
4 >&2) | lz4 -c > "$CHUNK.lz4" # Remove the chunk that was just compressed rm "$CHUNK" CHUNK_IDX=$((CHUNK_IDX + 1)) @@ -440,7 +454,7 @@ download_and_convert_all_sources() { SIZE="$(curl -I "$DB_SOURCE" -s | grep -i content-length | tr -cd '[0-9]')" - curl --continue-at - --create-dirs "$DB_SOURCE" --silent | pv -i 5 -n -s "$SIZE" 2> >(reportProgress - "Downloading database index for $DB_TYPE." 3 >&2) | zcat | java -jar "$CURRENT_LOCATION/helper_scripts/XmlToTabConverter.jar" 5 50 "$DB_TYPE" "$VERBOSE" | node "$CURRENT_LOCATION/helper_scripts/WriteToChunk.js" "$DB_INDEX_OUTPUT" "$VERBOSE" + curl --continue-at - --create-dirs "$DB_SOURCE" --silent | pv -i 5 -n -s "$SIZE" 2> >(reportProgress - "Downloading database index for $DB_TYPE." 3 >&2) | $CMD_ZCAT | $CURRENT_LOCATION/helper_scripts/xml-parser -t "$DB_TYPE" | $CURRENT_LOCATION/helper_scripts/write-to-chunk --output-dir "$DB_INDEX_OUTPUT" # Now, compress the different chunks CHUNKS=$(find "$DB_INDEX_OUTPUT" -name "*.chunk") @@ -451,7 +465,7 @@ download_and_convert_all_sources() { for CHUNK in $CHUNKS do echo "Compressing $CHUNK_IDX of $TOTAL_CHUNKS for $DB_TYPE" - pv -i 5 -n "$CHUNK" 2> >(reportProgress - "Processing chunk $CHUNK_IDX of $TOTAL_CHUNKS for $DB_TYPE index." 4 >&2) | pigz > "$CHUNK.gz" + pv -i 5 -n "$CHUNK" 2> >(reportProgress - "Processing chunk $CHUNK_IDX of $TOTAL_CHUNKS for $DB_TYPE index." 4 >&2) | lz4 -c > "$CHUNK.lz4" # Remove the chunk that was just compressed rm "$CHUNK" CHUNK_IDX=$((CHUNK_IDX + 1)) @@ -465,6 +479,8 @@ download_and_convert_all_sources() { IDX=$((IDX + 1)) done + + IFS="$OLDIFS" } filter_sources_by_taxa() { @@ -491,176 +507,142 @@ filter_sources_by_taxa() { mkdir -p "$TEMP_DIR/$UNIPEPT_TEMP_CONSTANT/filter" - $CURRENT_LOCATION/helper_scripts/filter_taxa.sh "$TAXA" "$DB_INDEX_OUTPUT" "$TEMP_DIR/$UNIPEPT_TEMP_CONSTANT/filter" "$OUTPUT_DIR/lineages.tsv.gz" + $CURRENT_LOCATION/helper_scripts/filter_taxa.sh "$TAXA" "$DB_INDEX_OUTPUT" "$TEMP_DIR/$UNIPEPT_TEMP_CONSTANT/filter" "$OUTPUT_DIR/lineages.tsv.lz4" IDX=$((IDX + 1)) done } create_most_tables() { - have "$OUTPUT_DIR/taxons.tsv.gz" || return + have "$OUTPUT_DIR/taxons.tsv.lz4" || return log "Started calculation of most tables." reportProgress "-1" "Started building main database tables." 5 mkdir -p "$OUTPUT_DIR" "$INTDIR" - if [ $VERBOSE = "true" ] - then - $VERBOSE_FLAG="--verbose" - fi - - cat - | java -Xms"$JAVA_MEM" -Xmx"$JAVA_MEM" -jar "$CURRENT_LOCATION/helper_scripts/TaxonsUniprots2Tables.jar" \ + cat - | $CURRENT_LOCATION/helper_scripts/taxons-uniprots-tables \ --peptide-min "$PEPTIDE_MIN_LENGTH" \ --peptide-max "$PEPTIDE_MAX_LENGTH" \ - --taxons "$(guz "$OUTPUT_DIR/taxons.tsv.gz")" \ - --peptides "$(gz "$INTDIR/peptides.tsv.gz")" \ - --uniprot-entries "$(gz "$OUTPUT_DIR/uniprot_entries.tsv.gz")" \ - --ec "$(gz "$OUTPUT_DIR/ec_cross_references.tsv.gz")" \ - --go "$(gz "$OUTPUT_DIR/go_cross_references.tsv.gz")" \ - --interpro "$(gz "$OUTPUT_DIR/interpro_cross_references.tsv.gz")" \ - $VERBOSE_FLAG - - log "Finished calculation of most tables with status $?" 
+ --taxons "$(luz "$OUTPUT_DIR/taxons.tsv.lz4")" \ + --peptides "$(lz "$INTDIR/peptides-out.tsv.lz4")" \ + --uniprot-entries "$(lz "$OUTPUT_DIR/uniprot_entries.tsv.lz4")" \ + --ec "$(lz "$OUTPUT_DIR/ec_cross_references.tsv.lz4")" \ + --go "$(lz "$OUTPUT_DIR/go_cross_references.tsv.lz4")" \ + --interpro "$(lz "$OUTPUT_DIR/interpro_cross_references.tsv.lz4")" + + log "Started sorting peptides table" + + $CMD_LZ4CAT $INTDIR/peptides-out.tsv.lz4 \ + | LC_ALL=C $CMD_SORT -k2 \ + | $CMD_LZ4 > $INTDIR/peptides-equalized.tsv.lz4 + + rm $INTDIR/peptides-out.tsv.lz4 + log "Finished calculation of most tables with status $?" } create_tables_and_filter() { filter_sources_by_taxa | create_most_tables } -join_equalized_pepts_and_entries() { - echo "Test if files for joining peptides are available." - have "$INTDIR/peptides.tsv.gz" "$OUTPUT_DIR/uniprot_entries.tsv.gz" || return - log "Started the joining of equalized peptides and uniprot entries." - mkfifo "peptides_eq" "entries_eq" - zcat "$INTDIR/peptides.tsv.gz" | gawk '{ printf("%012d\t%s\n", $4, $2) }' > "peptides_eq" & - zcat "$OUTPUT_DIR/uniprot_entries.tsv.gz" | gawk '{ printf("%012d\t%s\n", $1, $4) }' > "entries_eq" & - join -t ' ' -o '1.2,2.2' -j 1 "peptides_eq" "entries_eq" \ - | LC_ALL=C $CMD_SORT -k1 \ - | $CMD_GZIP - > "$INTDIR/aa_sequence_taxon_equalized.tsv.gz" - rm "peptides_eq" "entries_eq" - log "Finished the joining of equalized peptides and uniprot entries with status $?." -} +number_sequences() { + have "$INTDIR/peptides-equalized.tsv.lz4" || return + log "Started the numbering of sequences." -join_original_pepts_and_entries() { - have "$INTDIR/peptides.tsv.gz" "$OUTPUT_DIR/uniprot_entries.tsv.gz" || return - log "Started the joining of original peptides and uniprot entries." - mkfifo "peptides_orig" "entries_orig" - zcat "$INTDIR/peptides.tsv.gz" | gawk '{ printf("%012d\t%s\n", $4, $3) }' > "peptides_orig" & - zcat "$OUTPUT_DIR/uniprot_entries.tsv.gz" | gawk '{ printf("%012d\t%s\n", $1, $4) }' > "entries_orig" & - join -t ' ' -o '1.2,2.2' -j 1 "peptides_orig" "entries_orig" \ - | LC_ALL=C $CMD_SORT -k1 \ - | $CMD_GZIP - > "$INTDIR/aa_sequence_taxon_original.tsv.gz" - rm "peptides_orig" "entries_orig" - log "Finished the joining of original peptides and uniprot entries with status $?." -} + mkfifo "p_eq" + mkfifo "p_or" + $CMD_LZ4CAT $INTDIR/peptides-equalized.tsv.lz4 | cut -f 3 | sort | uniq > "p_or" & + $CMD_LZ4CAT $INTDIR/peptides-equalized.tsv.lz4 | cut -f 2 | uniq > "p_eq" & + + sort -u -m "p_or" "p_eq" | cat -n \ + | sed 's/^ *//' | $CMD_LZ4 - > "$INTDIR/sequences.tsv.lz4" + + rm "p_eq" "p_or" -number_sequences() { - have "$INTDIR/aa_sequence_taxon_equalized.tsv.gz" "$INTDIR/aa_sequence_taxon_original.tsv.gz" || return - log "Started the numbering of sequences." - mkfifo "equalized" "original" - zcat "$INTDIR/aa_sequence_taxon_equalized.tsv.gz" | cut -f1 | uniq > "equalized" & - zcat "$INTDIR/aa_sequence_taxon_original.tsv.gz" | cut -f1 | uniq > "original" & - LC_ALL=C $CMD_SORT -m "equalized" "original" | uniq | cat -n \ - | sed 's/^ *//' | $CMD_GZIP - > "$INTDIR/sequences.tsv.gz" - rm "equalized" "original" log "Finished the numbering of sequences with status $?." } +substitute_aas() { + have "$INTDIR/peptides-equalized.tsv.lz4" "$INTDIR/sequences.tsv.lz4" + + log "Started the substitution of equalized AA's by ID's for the peptides." 
+ $CMD_LZ4CAT $INTDIR/peptides-equalized.tsv.lz4 \ + | join -t ' ' -o '1.1,2.1,1.3,1.4,1.5,1.6' -1 2 -2 2 - "$(luz "$INTDIR/sequences.tsv.lz4")" \ + | $CMD_LZ4 - > "$INTDIR/peptides_by_equalized.tsv.lz4" + + rm "$INTDIR/peptides-equalized.tsv.lz4" + log "Finished the substitution of equalized AA's by ID's for the peptides with status $?." + + log "Started the substitution of original AA's by ID's for the peptides." + $CMD_LZ4CAT "$INTDIR/peptides_by_equalized.tsv.lz4" \ + | LC_ALL=C $CMD_SORT -k 3b,3 \ + | join -t ' ' -o '1.1,1.2,2.1,1.4,1.5,1.6' -1 3 -2 2 - "$(luz "$INTDIR/sequences.tsv.lz4")" \ + | $CMD_LZ4 - > "$INTDIR/peptides_by_original.tsv.lz4" + + log "Finished the substitution of original AA's by ID's for the peptides with status $?." +} calculate_equalized_lcas() { - have "$INTDIR/sequences.tsv.gz" "$INTDIR/aa_sequence_taxon_equalized.tsv.gz" "$OUTPUT_DIR/lineages.tsv.gz" || return - log "Started the calculation of equalized LCA's (after substituting AA's by ID's)." - join -t ' ' -o '1.1,2.2' -1 2 -2 1 \ - "$(guz "$INTDIR/sequences.tsv.gz")" \ - "$(guz "$INTDIR/aa_sequence_taxon_equalized.tsv.gz")" \ - | java -Xms"$JAVA_MEM" -Xmx"$JAVA_MEM" -jar "$CURRENT_LOCATION/helper_scripts/LineagesSequencesTaxons2LCAs.jar" "$(guz "$OUTPUT_DIR/lineages.tsv.gz")" \ - | $CMD_GZIP - > "$INTDIR/LCAs_equalized.tsv.gz" + have "$INTDIR/peptides_by_equalized.tsv.lz4" || return + log "Started the calculation of equalized LCA's." + $CMD_LZ4CAT $INTDIR/peptides_by_equalized.tsv.lz4 | cut -f 2,6 \ + | $CURRENT_LOCATION/helper_scripts/lcas --infile "$(luz "$OUTPUT_DIR/lineages.tsv.lz4")" \ + | $CMD_LZ4 - > "$INTDIR/LCAs_equalized.tsv.lz4" log "Finished the calculation of equalized LCA's (after substituting AA's by ID's) with status $?." } calculate_original_lcas() { - have "$INTDIR/sequences.tsv.gz" "$INTDIR/aa_sequence_taxon_original.tsv.gz" "$OUTPUT_DIR/lineages.tsv.gz" || return - log "Started the calculation of original LCA's (after substituting AA's by ID's)." - join -t ' ' -o '1.1,2.2' -1 2 -2 1 \ - "$(guz "$INTDIR/sequences.tsv.gz")" \ - "$(guz "$INTDIR/aa_sequence_taxon_original.tsv.gz")" \ - | java -Xms"$JAVA_MEM" -Xmx"$JAVA_MEM" -jar "$CURRENT_LOCATION/helper_scripts/LineagesSequencesTaxons2LCAs.jar" "$(guz "$OUTPUT_DIR/lineages.tsv.gz")" \ - | $CMD_GZIP - > "$INTDIR/LCAs_original.tsv.gz" + have "$INTDIR/peptides_by_original.tsv.lz4" || return + log "Started the calculation of original LCA's" + $CMD_LZ4CAT $INTDIR/peptides_by_original.tsv.lz4 | cut -f 3,6 \ + | $CURRENT_LOCATION/helper_scripts/lcas --infile "$(luz "$OUTPUT_DIR/lineages.tsv.lz4")" \ + | $CMD_LZ4 - > "$INTDIR/LCAs_original.tsv.lz4" log "Finished the calculation of original LCA's (after substituting AA's by ID's) with status $?." } -substitute_equalized_aas() { - have "$INTDIR/peptides.tsv.gz" "$INTDIR/sequences.tsv.gz" || return - log "Started the substitution of equalized AA's by ID's for the peptides." - zcat "$INTDIR/peptides.tsv.gz" \ - | LC_ALL=C $CMD_SORT -k 2b,2 \ - | join -t ' ' -o '1.1,2.1,1.3,1.4,1.5' -1 2 -2 2 - "$(guz "$INTDIR/sequences.tsv.gz")" \ - | $CMD_GZIP - > "$INTDIR/peptides_by_equalized.tsv.gz" - log "Finished the substitution of equalized AA's by ID's for the peptides with status $?." -} - - calculate_equalized_fas() { - have "$INTDIR/peptides_by_equalized.tsv.gz" || return + have "$INTDIR/peptides_by_equalized.tsv.lz4" || return log "Started the calculation of equalized FA's." 
mkfifo "peptides_eq" - zcat "$INTDIR/peptides_by_equalized.tsv.gz" | cut -f2,5 > "peptides_eq" & - node "$CURRENT_LOCATION/helper_scripts/FunctionalAnalysisPeptides.js" "peptides_eq" "$(gz "$INTDIR/FAs_equalized.tsv.gz")" + $CMD_LZ4CAT "$INTDIR/peptides_by_equalized.tsv.lz4" | cut -f2,5 > "peptides_eq" & + $CURRENT_LOCATION/helper_scripts/functional-analysis -i "peptides_eq" -o "$(lz "$INTDIR/FAs_equalized.tsv.lz4")" rm "peptides_eq" log "Finished the calculation of equalized FA's with status $?." } -substitute_original_aas() { - have "$INTDIR/peptides_by_equalized.tsv.gz" "$INTDIR/sequences.tsv.gz" || return - log "Started the substitution of original AA's by ID's for the peptides." - zcat "$INTDIR/peptides_by_equalized.tsv.gz" \ - | LC_ALL=C $CMD_SORT -k 3b,3 \ - | join -t ' ' -o '1.1,1.2,2.1,1.4,1.5' -1 3 -2 2 - "$(guz "$INTDIR/sequences.tsv.gz")" \ - | $CMD_GZIP - > "$INTDIR/peptides_by_original.tsv.gz" - log "Finished the substitution of equalized AA's by ID's for the peptides with status $?." -} - calculate_original_fas() { - have "$INTDIR/peptides_by_original.tsv.gz" || return + have "$INTDIR/peptides_by_original.tsv.lz4" || return log "Started the calculation of original FA's." mkfifo "peptides_orig" - zcat "$INTDIR/peptides_by_original.tsv.gz" | cut -f3,5 > "peptides_orig" & - node "$CURRENT_LOCATION/helper_scripts/FunctionalAnalysisPeptides.js" "peptides_orig" "$(gz "$INTDIR/FAs_original.tsv.gz")" + $CMD_LZ4CAT "$INTDIR/peptides_by_original.tsv.lz4" | cut -f3,5 > "peptides_orig" & + $CURRENT_LOCATION/helper_scripts/functional-analysis -i "peptides_orig" -o "$(lz "$INTDIR/FAs_original.tsv.lz4")" rm "peptides_orig" log "Finished the calculation of original FA's." } -sort_peptides() { - have "$INTDIR/peptides_by_original.tsv.gz" || return - log "Started sorting the peptides table." - mkdir -p "$OUTPUT_DIR" - zcat "$INTDIR/peptides_by_original.tsv.gz" \ - | LC_ALL=C $CMD_SORT -n \ - | $CMD_GZIP - > "$OUTPUT_DIR/peptides.tsv.gz" - log "Finished sorting the peptides table." -} create_sequence_table() { - have "$INTDIR/LCAs_original.tsv.gz" "$INTDIR/LCAs_equalized.tsv.gz" "$INTDIR/FAs_original.tsv.gz" "$INTDIR/FAs_equalized.tsv.gz" "$INTDIR/sequences.tsv.gz" || return + have "$INTDIR/LCAs_original.tsv.lz4" "$INTDIR/LCAs_equalized.tsv.lz4" "$INTDIR/FAs_original.tsv.lz4" "$INTDIR/FAs_equalized.tsv.lz4" "$INTDIR/sequences.tsv.lz4" || return log "Started the creation of the sequences table." 
mkdir -p "$OUTPUT_DIR" mkfifo "olcas" "elcas" "ofas" "efas" - zcat "$INTDIR/LCAs_original.tsv.gz" | gawk '{ printf("%012d\t%s\n", $1, $2) }' > "olcas" & - zcat "$INTDIR/LCAs_equalized.tsv.gz" | gawk '{ printf("%012d\t%s\n", $1, $2) }' > "elcas" & - zcat "$INTDIR/FAs_original.tsv.gz" | gawk '{ printf("%012d\t%s\n", $1, $2) }' > "ofas" & - zcat "$INTDIR/FAs_equalized.tsv.gz" | gawk '{ printf("%012d\t%s\n", $1, $2) }' > "efas" & - zcat "$INTDIR/sequences.tsv.gz" | gawk '{ printf("%012d\t%s\n", $1, $2) }' \ + $CMD_LZ4CAT "$INTDIR/LCAs_original.tsv.lz4" | gawk '{ printf("%012d\t%s\n", $1, $2) }' > "olcas" & + $CMD_LZ4CAT "$INTDIR/LCAs_equalized.tsv.lz4" | gawk '{ printf("%012d\t%s\n", $1, $2) }' > "elcas" & + $CMD_LZ4CAT "$INTDIR/FAs_original.tsv.lz4" | gawk '{ printf("%012d\t%s\n", $1, $2) }' > "ofas" & + $CMD_LZ4CAT "$INTDIR/FAs_equalized.tsv.lz4" | gawk '{ printf("%012d\t%s\n", $1, $2) }' > "efas" & + $CMD_LZ4CAT "$INTDIR/sequences.tsv.lz4" | gawk '{ printf("%012d\t%s\n", $1, $2) }' \ | join --nocheck-order -a1 -e '\N' -t ' ' -o "1.1 1.2 2.2" - "olcas" \ | join --nocheck-order -a1 -e '\N' -t ' ' -o "1.1 1.2 1.3 2.2" - "elcas" \ | join --nocheck-order -a1 -e '\N' -t ' ' -o '1.1 1.2 1.3 1.4 2.2' - "ofas" \ | join --nocheck-order -a1 -e '\N' -t ' ' -o '1.1 1.2 1.3 1.4 1.5 2.2' - "efas" \ - | sed 's/^0*//' | $CMD_GZIP - > "$OUTPUT_DIR/sequences.tsv.gz" + | sed 's/^0*//' \ + | awk -F'\t' 'BEGIN {OFS="\t"} {gsub(/Z/, "K", $2); print}' \ + | $CMD_LZ4 - > "$OUTPUT_DIR/sequences.tsv.lz4" rm "olcas" "elcas" "ofas" "efas" log "Finished the creation of the sequences table." } @@ -679,7 +661,7 @@ fetch_ec_numbers() { /^DE/ { gsub(/.$/, "", $2) name = name $2 } END { print id, name }' - } | cat -n | sed 's/^ *//' | $CMD_GZIP - > "$OUTPUT_DIR/ec_numbers.tsv.gz" + } | cat -n | sed 's/^ *//' | $CMD_LZ4 - > "$OUTPUT_DIR/ec_numbers.tsv.lz4" log "Finished creating EC numbers." } @@ -711,14 +693,14 @@ fetch_go_terms() { id++ } } - type = "" }' | $CMD_GZIP - > "$OUTPUT_DIR/go_terms.tsv.gz" + type = "" }' | $CMD_LZ4 - > "$OUTPUT_DIR/go_terms.tsv.lz4" log "Finished creating GO terms." } fetch_interpro_entries() { log "Started creating InterPro Entries." mkdir -p "$OUTPUT_DIR" - curl -s "$INTERPRO_URL" | grep '^IPR' | cat -n | sed 's/^ *//' | $CMD_GZIP - > "$OUTPUT_DIR/interpro_entries.tsv.gz" + curl -s "$INTERPRO_URL" | grep '^IPR' | cat -n | sed 's/^ *//' | $CMD_LZ4 - > "$OUTPUT_DIR/interpro_entries.tsv.lz4" log "Finished creating InterPro Entries." } @@ -728,10 +710,10 @@ fetch_interpro_entries() { #dot: create_kmer_index -> kmer_index #dot: kmer_index [color="#f28e2b"] create_kmer_index() { - have "$OUTPUT_DIR/uniprot_entries.tsv.gz" "$OUTPUT_DIR/taxons.tsv.gz" || return + have "$OUTPUT_DIR/uniprot_entries.tsv.lz4" "$OUTPUT_DIR/taxons.tsv.lz4" || return log "Started the construction of the $KMER_LENGTH-mer index." 
for PREFIX in A C D E F G H I K L M N P Q R S T V W Y; do - pv -N $PREFIX "$OUTPUT_DIR/uniprot_entries.tsv.gz" \ + pv -N $PREFIX "$OUTPUT_DIR/uniprot_entries.tsv.lz4" \ | gunzip \ | cut -f4,7 \ | grep "^[0-9]* [ACDEFGHIKLMNPQRSTVWY]*$" \ @@ -740,7 +722,7 @@ create_kmer_index() { | LC_ALL=C $CMD_SORT \ | sed "s/^/$PREFIX/" done \ - | umgap joinkmers "$(guz "$OUTPUT_DIR/taxons.tsv.gz")" \ + | umgap joinkmers "$(guz "$OUTPUT_DIR/taxons.tsv.lz4")" \ | cut -d' ' -f1,2 \ | umgap buildindex \ > "$OUTPUT_DIR/$KMER_LENGTH-mer.index" @@ -752,9 +734,9 @@ create_kmer_index() { #dot: create_tryptic_index -> tryptic_index #dot: tryptic_index [color="#f28e2b"] create_tryptic_index() { - have "$TABDIR/sequences.tsv.gz" || return + have "$TABDIR/sequences.tsv.lz4" || return log "Started the construction of the tryptic index." - pv "$TABDIR/sequences.tsv.gz" \ + pv "$TABDIR/sequences.tsv.lz4" \ | gunzip \ | cut -f2,3 \ | grep -v "\\N" \ @@ -771,43 +753,31 @@ database) download_and_convert_all_sources create_tables_and_filter echo "Created tables!" - join_equalized_pepts_and_entries & - pid1=$! - join_original_pepts_and_entries & - pid2=$! - wait $pid1 - wait $pid2 number_sequences - reportProgress "-1" "Calculating lowest common ancestors." 6 + substitute_aas + reportProgress "-1" "Calculating lowest common ancestors and functional annotations." 6 calculate_equalized_lcas & pid1=$! calculate_original_lcas & pid2=$! - wait $pid1 - wait $pid2 - rm "$INTDIR/aa_sequence_taxon_equalized.tsv.gz" - rm "$INTDIR/aa_sequence_taxon_original.tsv.gz" - substitute_equalized_aas - rm "$INTDIR/peptides.tsv.gz" - substitute_original_aas - reportProgress "-1" "Calculating functional annotations." 7 calculate_equalized_fas & - pid1=$! + pid3=$! calculate_original_fas & - pid2=$! + pid4=$! wait $pid1 wait $pid2 - rm "$INTDIR/peptides_by_equalized.tsv.gz" - reportProgress "-1" "Sorting peptides." 8 - sort_peptides - rm "$INTDIR/peptides_by_original.tsv.gz" + wait $pid3 + wait $pid4 reportProgress "-1" "Creating sequence table." 9 create_sequence_table - rm "$INTDIR/LCAs_original.tsv.gz" - rm "$INTDIR/LCAs_equalized.tsv.gz" - rm "$INTDIR/FAs_original.tsv.gz" - rm "$INTDIR/FAs_equalized.tsv.gz" - rm "$INTDIR/sequences.tsv.gz" + rm "$INTDIR/LCAs_original.tsv.lz4" + rm "$INTDIR/LCAs_equalized.tsv.lz4" + rm "$INTDIR/FAs_original.tsv.lz4" + rm "$INTDIR/FAs_equalized.tsv.lz4" + rm "$INTDIR/sequences.tsv.lz4" + rm "$INTDIR/peptides_by_equalized.tsv.lz4" + # Use the original sort as the result + mv "$INTDIR/peptides_by_original.tsv.lz4" "$OUTPUT_DIR/peptides.tsv.lz4" reportProgress "-1" "Fetching EC numbers." 10 fetch_ec_numbers reportProgress "-1" "Fetching GO terms." 11 @@ -815,11 +785,11 @@ database) reportProgress "-1" "Fetching InterPro entries." 12 fetch_interpro_entries reportProgress "-1" "Computing database indices" 13 - ENTRIES=$(zcat "$OUTPUT_DIR/uniprot_entries.tsv.gz" | wc -l) + ENTRIES=$($CMD_LZ4CAT "$OUTPUT_DIR/uniprot_entries.tsv.lz4" | wc -l) echo "Database contains: ##$ENTRIES##" ;; static-database) - if ! have "$TABDIR/taxons.tsv.gz"; then + if ! have "$TABDIR/taxons.tsv.lz4"; then create_taxon_tables fi fetch_ec_numbers @@ -830,10 +800,10 @@ kmer-index) checkdep pv checkdep umgap "umgap crate (for umgap buildindex)" - if ! have "$OUTPUT_DIR/taxons.tsv.gz"; then + if ! have "$OUTPUT_DIR/taxons.tsv.lz4"; then create_taxon_tables fi - if ! have "$OUTPUT_DIR/uniprot_entries.tsv.gz"; then + if ! 
have "$OUTPUT_DIR/uniprot_entries.tsv.lz4"; then download_and_convert_all_sources create_tables_and_filter fi @@ -843,22 +813,20 @@ tryptic-index) checkdep pv checkdep umgap "umgap crate (for umgap buildindex)" - if ! have "$TABDIR/taxons.tsv.gz"; then + if ! have "$TABDIR/taxons.tsv.lz4"; then create_taxon_tables fi - if ! have "$TABDIR/sequences.tsv.gz"; then + if ! have "$TABDIR/sequences.tsv.lz4"; then download_and_convert_all_sources create_tables_and_filter - join_equalized_pepts_and_entries - join_original_pepts_and_entries number_sequences + substitute_aas calculate_equalized_lcas calculate_original_lcas - substitute_equalized_aas calculate_equalized_fas - substitute_original_aas calculate_original_fas create_sequence_table + # TODO remove temp files fi create_tryptic_index ;; diff --git a/scripts/helper_scripts/.gitignore b/scripts/helper_scripts/.gitignore new file mode 100644 index 00000000..38df1f8a --- /dev/null +++ b/scripts/helper_scripts/.gitignore @@ -0,0 +1,9 @@ +# Ignore the compiled binaries that get moved here +dat-parser +functional-analysis +lcas +taxa-by-chunk +taxons-lineages +taxons-uniprots-tables +write-to-chunk +xml-parser diff --git a/scripts/helper_scripts/FunctionalAnalysisPeptides.js b/scripts/helper_scripts/FunctionalAnalysisPeptides.js deleted file mode 100755 index 2e635468..00000000 --- a/scripts/helper_scripts/FunctionalAnalysisPeptides.js +++ /dev/null @@ -1,73 +0,0 @@ -const readline = require('readline'); -const fs = require('fs'); -const start = new Date().getTime(); -const args = process.argv; -if (args.length !== 4) { - console.log("Please provide 2 parameters: input and output."); - process.exit(1); -} -const inputFile = args[2]; -const outputFile = args[3]; -const readInterface = readline.createInterface({ - input: fs.createReadStream(inputFile) -}); -const writer = fs.createWriteStream(outputFile); -let row = null; -let curPept = null; -let numProt = 0; -let numAnnotatedGO = 0; -let numAnnotatedEC = 0; -let numAnnotatedInterPro = 0; -let done = 0; -let m = new Map(); -readInterface.on('line', function (line) { - row = line.split("\t"); - if (row[0] !== curPept) { - if (curPept !== null) { - if (m.size !== 0) { - writer.write(`${curPept}\t{"num":{"all":${numProt},"EC":${numAnnotatedEC},"GO":${numAnnotatedGO},"IPR":${numAnnotatedInterPro}},"data":{${Array.from(m.entries(), ([k, v]) => `"${k}":${v}`).join(",")}}}\n`); - } - } - m.clear(); - numProt = 0; - numAnnotatedGO = 0; - numAnnotatedEC = 0; - numAnnotatedInterPro = 0; - curPept = row[0]; - } - numProt++; - if (row.length > 1) { - const terms = row[1].split(";"); - let hasEC = false; - let hasGO = false; - let hasInterPro = false; - for (const term of terms) { - if (!term) { - continue; - } - if (term.startsWith("G")) { - hasGO = true; - } else if (term.startsWith("E")) { - hasEC = true; - } else { - hasInterPro = true; - } - m.set(term, (m.get(term) || 0) + 1); - } - numAnnotatedGO += hasGO ? 1 : 0; - numAnnotatedEC += hasEC ? 1 : 0; - numAnnotatedInterPro += hasInterPro ? 
1 : 0; - } - done++; - if (done % 1000000 === 0) { - console.log("FA " + done + " rows"); - } -}); -readInterface.on('close', function () { - if (m.size !== 0) { - writer.write(`${curPept}\t{"num":{"all":${numProt},"EC":${numAnnotatedEC},"GO":${numAnnotatedGO},"IPR":${numAnnotatedInterPro}},"data":{${Array.from(m.entries(), ([k, v]) => `"${k}":${v}`).join(",")}}}\n`); - } - writer.end(); - const end = new Date().getTime(); - console.log("Took " + (end - start) / 1000 + "s"); -}); diff --git a/scripts/helper_scripts/LineagesSequencesTaxons2LCAs.jar b/scripts/helper_scripts/LineagesSequencesTaxons2LCAs.jar deleted file mode 100755 index fee277c5..00000000 Binary files a/scripts/helper_scripts/LineagesSequencesTaxons2LCAs.jar and /dev/null differ diff --git a/scripts/helper_scripts/NamesNodes2TaxonsLineages.jar b/scripts/helper_scripts/NamesNodes2TaxonsLineages.jar deleted file mode 100644 index 1f3b817d..00000000 Binary files a/scripts/helper_scripts/NamesNodes2TaxonsLineages.jar and /dev/null differ diff --git a/scripts/helper_scripts/ParallelXmlToTab.js b/scripts/helper_scripts/ParallelXmlToTab.js deleted file mode 100755 index dd1557eb..00000000 --- a/scripts/helper_scripts/ParallelXmlToTab.js +++ /dev/null @@ -1,10 +0,0 @@ -const readline = require('readline'); -const fs = require('fs'); - -const rl = readline.createInterface({ - input: process.stdin -}); - -let buffer = ""; - - diff --git a/scripts/helper_scripts/TaxaByChunk.js b/scripts/helper_scripts/TaxaByChunk.js deleted file mode 100755 index fff59f75..00000000 --- a/scripts/helper_scripts/TaxaByChunk.js +++ /dev/null @@ -1,50 +0,0 @@ -/** - * This script looks for which taxa should be looked up in which chunk. The list of taxa that need to be looked up is - * read from stdin. A list of files, taxa (thus the taxa that need to be looked up in the corresponding file) are - * provided through stdout. - * - * The script requires two command line arguments: the folder in which all Unipept DB chunks are present and a - * temporary folder that can be used by the script to store temporary files. - */ - -const readline = require("readline"); -const fs = require("fs"); -const path = require("path"); - -const args = process.argv; - -if (args.length !== 4) { - console.error("This script expects exactly two parameters: unipept_db_chunk_folder temporary_folder"); - process.exit(1); -} - -const rl = readline.createInterface({ - input: process.stdin -}); - -const allTaxa = []; - -rl.on("line", (line) => { - allTaxa.push(parseInt(line.trim())); -}); - -// In this hook we should start to link input files with the taxa that need to be looked up in there. 
-rl.on("close", () => { - for (const file of fs.readdirSync(args[2])) { - const baseFile = path.basename(file); - if (baseFile.match(/unipept\..*\.gz/)) { - const range = baseFile.replace(/unipept\.|\.gz/g, '').split("-"); - const startRange = parseInt(range[0]); - const endRange = parseInt(range[1]); - - const matchedTaxa = allTaxa.filter(t => startRange <= t && t <= endRange); - - if (matchedTaxa && matchedTaxa.length > 0) { - fs.writeFileSync(path.join(args[3], baseFile + ".pattern"), matchedTaxa.map(t => "\t" + t + "$").join("\n")); - - console.log(path.join(args[3], baseFile + ".pattern")); - console.log(path.join(args[2], file)); - } - } - } -}); diff --git a/scripts/helper_scripts/TaxonsUniprots2Tables.jar b/scripts/helper_scripts/TaxonsUniprots2Tables.jar deleted file mode 100644 index bf9f13e8..00000000 Binary files a/scripts/helper_scripts/TaxonsUniprots2Tables.jar and /dev/null differ diff --git a/scripts/helper_scripts/WriteToChunk.js b/scripts/helper_scripts/WriteToChunk.js deleted file mode 100755 index 594ae4a5..00000000 --- a/scripts/helper_scripts/WriteToChunk.js +++ /dev/null @@ -1,49 +0,0 @@ -const readline = require("readline"); -const fs = require("fs"); -const path = require("path"); - -const outputDir = process.argv[2]; - -const verbose = process.argv[3] === "true"; - -const rl = readline.createInterface({ - input: process.stdin -}); - -const taxaBounds = [ - 0, 550, 1352, 3047, 5580, 8663, 11676, 32473, 40214, 52774, 66656, 86630, 116960, 162147, 210225, 267979, 334819, - 408172, 470868, 570509, 673318, 881260, 1046115, 1136135, 1227077, 1300307, 1410620, 1519492, 1650438, 1756149, - 1820614, 1871070, 1898104, 1922217, 1978231, 2024617, 2026757, 2035430, 2070414, 2202732, 2382165, 2527964, 2601669, - 2706029, 10000000 -]; - -const fileObjects = [...Object.keys(taxaBounds)].slice(0, -1).map(idx => Number.parseInt(idx)).map( - idx => fs.createWriteStream(path.join(outputDir, `unipept.${taxaBounds[idx]}-${taxaBounds[idx + 1]}.chunk`)) -); - -let headerSkipped = false; - -rl.on("line", (line) => { - if (verbose) { - console.error("INFO VERBOSE: writing line to chunk: " + line); - } - - if (!headerSkipped) { - headerSkipped = true; - const writeStream = fs.createWriteStream(path.join(outputDir, 'db.header')); - writeStream.write(line + "\n"); - writeStream.close(); - return; - } - - const taxonId = Number.parseInt(line.split("\t")[8].trim()); - - let idx = 0; - while (taxonId > taxaBounds[idx]) { - idx++; - } - - fileObjects[idx - 1].write(line + "\n"); -}); - -rl.on("close", () => fileObjects.map(o => o.close())); diff --git a/scripts/helper_scripts/XmlToTabConverter.jar b/scripts/helper_scripts/XmlToTabConverter.jar deleted file mode 100644 index 734e2498..00000000 Binary files a/scripts/helper_scripts/XmlToTabConverter.jar and /dev/null differ diff --git a/scripts/helper_scripts/filter_taxa.sh b/scripts/helper_scripts/filter_taxa.sh index 017a512d..b9b4e76a 100755 --- a/scripts/helper_scripts/filter_taxa.sh +++ b/scripts/helper_scripts/filter_taxa.sh @@ -14,7 +14,7 @@ mkdir -p "$TMP_DIR" filter_taxa() { QUERY=$(echo "\s$1\s" | sed "s/,/\\\s\\\|\\\s/g") - RESULT=$(cat "$LINEAGE_ARCHIVE" | zcat | grep "$QUERY" | cut -f1 | sort -n | uniq | tr '\n' ',') + RESULT=$(lz4 -dc "$LINEAGE_ARCHIVE" | grep "$QUERY" | cut -f1 | sort -n | uniq | tr '\n' ',') echo "$RESULT" } @@ -23,16 +23,16 @@ then TAXA=$(filter_taxa "$TAXA") # This associative array maps a filename upon the taxa that should be queried within this file - QUERIES=( $(echo "$TAXA" | tr "," "\n" | node 
"$CURRENT_LOCATION/TaxaByChunk.js" "$DATABASE_INDEX" "$TMP_DIR") ) + QUERIES=( $(echo "$TAXA" | tr "," "\n" | $CURRENT_LOCATION/taxa-by-chunk --chunk-dir "$DATABASE_INDEX" --temp-dir "$TMP_DIR") ) if [[ ${#QUERIES[@]} -gt 0 ]] then - parallel --jobs 8 --max-args 2 "cat {2} | zcat | sed 's/$/$/' | grep -F -f {1} | sed 's/\$$//'" ::: "${QUERIES[@]}" + parallel --jobs 8 --max-args 2 "lz4 -dc {2} | sed 's/$/$/' | grep -F -f {1} | sed 's/\$$//'" ::: "${QUERIES[@]}" fi else # If the root ID has been passed to this script, we simply print out all database items (without filtering). - find "$DATABASE_INDEX" -name "*.chunk.gz" | xargs zcat + find "$DATABASE_INDEX" -name "*.chunk.lz4" -exec lz4 -mdc {} + fi # Remove temporary files diff --git a/scripts/helper_scripts/parser/pom.xml b/scripts/helper_scripts/parser/pom.xml deleted file mode 100644 index a8fa31d7..00000000 --- a/scripts/helper_scripts/parser/pom.xml +++ /dev/null @@ -1,107 +0,0 @@ - - 4.0.0 - unipept - unipept - 0.0.1-SNAPSHOT - - UTF-8 - - - - - ${basedir}/src/main/java - - - - - ${basedir}/src/test/java - - - ${basedir}/src/test/resources - - **/*.* - - - - - - maven-compiler-plugin - 3.1 - - 1.8 - 1.8 - - - - org.apache.maven.plugins - maven-dependency-plugin - - - copy-dependencies - prepare-package - - copy-dependencies - - - ${project.build.directory}/lib - false - false - true - - - - - - org.apache.maven.plugins - maven-jar-plugin - 2.4 - - - - true - lib/ - - - - - - - - - junit - junit - 4.11 - - - com.beust - jcommander - 1.48 - - - javax.json - javax.json-api - 1.1 - - - - org.glassfish - javax.json - 1.1 - - - - - oracleReleases - Oracle Released Java Packages - http://download.oracle.com/maven - default - - - Unipept - https://github.ugent.be/bmesuere/unipept - The Unipept web application supports biodiversity analysis of large and complex metaproteome samples. 
- - https://github.ugent.be/bmesuere/unipept.git - - diff --git a/scripts/helper_scripts/parser/src/manifests/LineagesSequencesTaxons2LCAs/META-INF/MANIFEST.MF b/scripts/helper_scripts/parser/src/manifests/LineagesSequencesTaxons2LCAs/META-INF/MANIFEST.MF deleted file mode 100755 index d6c5d70b..00000000 --- a/scripts/helper_scripts/parser/src/manifests/LineagesSequencesTaxons2LCAs/META-INF/MANIFEST.MF +++ /dev/null @@ -1,3 +0,0 @@ -Manifest-Version: 1.0 -Main-Class: tools.LineagesSequencesTaxons2LCAs - diff --git a/scripts/helper_scripts/parser/src/manifests/NamesNodes2TaxonsLineages/META-INF/MANIFEST.MF b/scripts/helper_scripts/parser/src/manifests/NamesNodes2TaxonsLineages/META-INF/MANIFEST.MF deleted file mode 100755 index 143dcc30..00000000 --- a/scripts/helper_scripts/parser/src/manifests/NamesNodes2TaxonsLineages/META-INF/MANIFEST.MF +++ /dev/null @@ -1,3 +0,0 @@ -Manifest-Version: 1.0 -Main-Class: tools.NamesNodes2TaxonsLineages - diff --git a/scripts/helper_scripts/parser/src/manifests/TaxonsUniprots2Tables/META-INF/MANIFEST.MF b/scripts/helper_scripts/parser/src/manifests/TaxonsUniprots2Tables/META-INF/MANIFEST.MF deleted file mode 100755 index 1f0472ba..00000000 --- a/scripts/helper_scripts/parser/src/manifests/TaxonsUniprots2Tables/META-INF/MANIFEST.MF +++ /dev/null @@ -1,3 +0,0 @@ -Manifest-Version: 1.0 -Main-Class: tools.TaxonsUniprots2Tables - diff --git a/scripts/helper_scripts/parser/src/manifests/XmlToTabConverter/META-INF/MANIFEST.MF b/scripts/helper_scripts/parser/src/manifests/XmlToTabConverter/META-INF/MANIFEST.MF deleted file mode 100755 index 87a3f9e2..00000000 --- a/scripts/helper_scripts/parser/src/manifests/XmlToTabConverter/META-INF/MANIFEST.MF +++ /dev/null @@ -1,3 +0,0 @@ -Manifest-Version: 1.0 -Main-Class: tools.XmlToTabConverter - diff --git a/scripts/helper_scripts/parser/src/storage/CSV.java b/scripts/helper_scripts/parser/src/storage/CSV.java deleted file mode 100755 index 1e5e9587..00000000 --- a/scripts/helper_scripts/parser/src/storage/CSV.java +++ /dev/null @@ -1,93 +0,0 @@ -package storage; - -import java.io.IOException; -import java.io.BufferedReader; -import java.io.BufferedWriter; -import java.io.InputStreamReader; -import java.io.OutputStreamWriter; -import java.util.zip.GZIPInputStream; -import java.util.zip.GZIPOutputStream; -import java.io.FileInputStream; -import java.io.FileOutputStream; - -public class CSV { - - private static final int MB4 = 4194304; - - public static class Reader { - private BufferedReader buffer; - - public Reader(String file) throws IOException { - buffer = new BufferedReader( - new InputStreamReader( - new FileInputStream(file) - ) - ); - } - - public String[] read() throws IOException { - String line = buffer.readLine(); - if(line == null) return null; - return line.split(" "); - } - - public void close() throws IOException { - buffer.close(); - } - } - - public static class Writer { - protected BufferedWriter buffer; - - public Writer(String file) throws IOException { - buffer = new BufferedWriter( - new OutputStreamWriter( - new FileOutputStream(file) - ), MB4 - ); - } - - public void write(String... values) throws IOException { - buffer.write(values[0]); - for(int i = 1; i < values.length; i++) { - buffer.write(" " + (values[i] == null ? 
"\\N" : values[i])); - } - buffer.newLine(); - } - - public void close() throws IOException { - buffer.close(); - } - } - - public static class IndexedWriter extends Writer { - private long index; - - public IndexedWriter(String file) throws IOException { - super(file); - index = 0; - } - - @Override - public void write(String... values) throws IOException { - buffer.write(Long.toString(++index)); - for(int i = 0; i < values.length; i++) { - buffer.write(" " + (values[i] == null ? "\\N" : values[i])); - } - buffer.newLine(); - } - - public long index() { - return index; - } - } - - public static String toString(boolean b) { - return b ? "\1" : "\0"; - } - - public static boolean toBoolean(String b) { - return b.charAt(0) == (char) 1; - } - -} diff --git a/scripts/helper_scripts/parser/src/storage/TabWriter.java b/scripts/helper_scripts/parser/src/storage/TabWriter.java deleted file mode 100755 index 191ff47e..00000000 --- a/scripts/helper_scripts/parser/src/storage/TabWriter.java +++ /dev/null @@ -1,66 +0,0 @@ -package storage; - -import xml.*; - -import java.io.*; -import java.util.stream.Collectors; - -public class TabWriter implements UniprotObserver { - private final BufferedWriter out; - private final boolean verbose; - - public TabWriter( - OutputStream out, - boolean verbose - ) throws IOException { - this.out = new BufferedWriter(new OutputStreamWriter(out)); - this.verbose = verbose; - - // Write header to output file - this.out.write(String.join("\t", new String[]{ - "Entry", - "Sequence", - "Protein names", - "Version (entry)", - "EC number", - "Gene ontology IDs", - "Cross-reference (InterPro)", - "Status", - "Organism ID" - }) + "\n"); - } - - @Override - public void handleEntry(UniprotEntry entry) { - try { - String line = String.join("\t", new String[]{ - entry.getUniprotAccessionNumber(), - entry.getSequence(), - entry.getName(), - String.valueOf(entry.getVersion()), - entry.getECReferences().stream().map(UniprotECRef::getId).collect(Collectors.joining(";")), - entry.getGOReferences().stream().map(UniprotGORef::getId).collect(Collectors.joining(";")), - entry.getInterProReferences().stream().map(UniprotInterProRef::getId).collect(Collectors.joining(";")), - "swissprot", - String.valueOf(entry.getTaxonId()), - }); - - if (verbose) { - System.err.println("INFO VERBOSE: Writing tabular line: " + line); - } - - this.out.write(line + "\n"); - } catch (IOException e) { - System.err.println("Could not write to output stream."); - } - } - - @Override - public void close() { - try { - this.out.close(); - } catch (IOException e) { - System.err.println("Could not correctly close output stream."); - } - } -} diff --git a/scripts/helper_scripts/parser/src/storage/TableWriter.java b/scripts/helper_scripts/parser/src/storage/TableWriter.java deleted file mode 100755 index 98de7a3f..00000000 --- a/scripts/helper_scripts/parser/src/storage/TableWriter.java +++ /dev/null @@ -1,249 +0,0 @@ -package storage; - -import taxons.TaxonList; -import tools.TaxonsUniprots2Tables; -import xml.*; - -import java.util.Set; -import java.util.stream.Collectors; -import java.util.stream.Stream; -import java.util.HashSet; -import java.util.List; -import java.util.Map; -import java.util.ArrayList; -import java.util.HashMap; -import java.io.IOException; -import java.io.File; -import java.sql.Timestamp; - - -/** - * Intermediate class to add PeptideData to the database - * - * @author Bart Mesuere - * @author Felix Van der Jeugt - * - */ -public class TableWriter implements UniprotObserver { - - public static 
final String[] ranks = new String[]{"taxon_id", "superkingdom", "kingdom", "subkingdom", "superphylum", "phylum", "subphylum","superclass", "class", "subclass", "superorder", "order", "suborder", "infraorder", "superfamily", "family", "subfamily", "tribe", "subtribe", "genus", "subgenus", "species_group", "species_subgroup", "species", "subspecies", "strain", "varietas", "forma"}; - private static final Map rankIndices = new HashMap<>(); - - static { - for(int i = 0; i < ranks.length; i++) { - rankIndices.put(ranks[i], i); - } - } - - private TaxonList taxonList; - private Set wrongTaxonIds; - - // csv files - private CSV.IndexedWriter peptides; - private CSV.IndexedWriter uniprotEntries; - private CSV.IndexedWriter goCrossReferences; - private CSV.IndexedWriter ecCrossReferences; - private CSV.IndexedWriter interProCrossReferences; - - /** - * Creates a new data object - */ - public TableWriter(TaxonsUniprots2Tables args) { - wrongTaxonIds = new HashSet(); - - /* Opening CSV files for writing. */ - try { - taxonList = TaxonList.loadFromFile(args.taxonsFile); - peptides = new CSV.IndexedWriter(args.peptidesFile); - uniprotEntries = new CSV.IndexedWriter(args.uniprotEntriesFile); - ecCrossReferences = new CSV.IndexedWriter(args.ecCrossReferencesFile); - goCrossReferences = new CSV.IndexedWriter(args.goCrossReferencesFile); - interProCrossReferences = new CSV.IndexedWriter(args.interProCrossReferencesFile); - } catch(IOException e) { - System.err.println(new Timestamp(System.currentTimeMillis()) - + " Error creating tsv files"); - e.printStackTrace(); - System.exit(1); - } - - } - - /** - * Stores a complete UniprotEntry in the database - * - * @param entry - * the UniprotEntry to store - */ - public void store(UniprotEntry entry) { - long uniprotEntryId = addUniprotEntry(entry.getUniprotAccessionNumber(), entry.getVersion(), - entry.getTaxonId(), entry.getType(), entry.getName(), entry.getSequence()); - if (uniprotEntryId != -1) { // failed to add entry - String faSummary = Stream.of( - entry.getGOReferences().stream().map(UniprotGORef::getId), - entry.getECReferences().stream().filter(x -> !x.getId().isEmpty()).map(x->"EC:"+x.getId()), - entry.getInterProReferences().stream().filter(x -> !x.getId().isEmpty()).map(x->"IPR:"+x.getId()) - ).flatMap(i -> i).collect(Collectors.joining(";")); - - for(String sequence : entry.digest()) { - addData(sequence.replace('I', 'L'), uniprotEntryId, sequence, faSummary); - } - for (UniprotGORef ref : entry.getGOReferences()) - addGORef(ref, uniprotEntryId); - for (UniprotECRef ref : entry.getECReferences()) - addECRef(ref, uniprotEntryId); - for (UniprotInterProRef ref : entry.getInterProReferences()) - addInterProRef(ref, uniprotEntryId); - } - } - - /** - * - * Inserts the entry info of a uniprot entry into the database and returns - * the generated id. - * - * @param uniprotAccessionNumber - * The accession number of the entry - * @param version - * The version of the entry - * @param taxonId - * The taxonId of the organism of the entry - * @param type - * The type of the entry. Can be swissprot or trembl - * @param sequence - * The full sequence of the peptide. - * @return The database ID of the uniprot entry. 
- */ - public long addUniprotEntry(String uniprotAccessionNumber, int version, int taxonId, - String type, String name, String sequence) { - if(0 <= taxonId && taxonId < taxonList.size() && taxonList.get(taxonId) != null) { - try { - uniprotEntries.write( - uniprotAccessionNumber, - Integer.toString(version), - Integer.toString(taxonId), - type, - name, - sequence - ); - return uniprotEntries.index(); - } catch(IOException e) { - System.err.println(new Timestamp(System.currentTimeMillis()) - + " Error writing to CSV."); - e.printStackTrace(); - } - } else { - if (!wrongTaxonIds.contains(taxonId)) { - wrongTaxonIds.add(taxonId); - System.err.println(new Timestamp(System.currentTimeMillis()) + " " + taxonId - + " added to the list of " + wrongTaxonIds.size() + " invalid taxonIds."); - } - } - return -1; - } - - /** - * Adds peptide data to the database - * - * @param unifiedSequence - * The sequence of the peptide with AA's I and L the - * same. - * @param uniprotEntryId - * The id of the uniprot entry from which the peptide data was - * retrieved. - * @param originalSequence - * The original sequence of the peptide. - * @param functionalAnnotations - * A semicollon separated list of allocated functional analysis terms - */ - public void addData(String unifiedSequence, long uniprotEntryId, String originalSequence, String functionalAnnotations) { - try { - peptides.write( - unifiedSequence, - originalSequence, - Long.toString(uniprotEntryId), - functionalAnnotations - ); - } catch(IOException e) { - System.err.println(new Timestamp(System.currentTimeMillis()) - + " Error adding this peptide to the database: " + unifiedSequence); - e.printStackTrace(); - } - } - - /** - * Adds a uniprot entry GO reference to the database - * - * @param ref - * The uniprot GO reference to add - * @param uniprotEntryId - * The uniprotEntry of the cross reference - */ - public void addGORef(UniprotGORef ref, long uniprotEntryId) { - try { - goCrossReferences.write(Long.toString(uniprotEntryId), ref.getId()); - } catch (IOException e) { - System.err.println(new Timestamp(System.currentTimeMillis()) - + " Error adding this GO reference to the database."); - e.printStackTrace(); - } - - } - - /** - * Adds a uniprot entry EC reference to the database - * - * @param ref - * The uniprot EC reference to add - * @param uniprotEntryId - * The uniprotEntry of the cross reference - */ - public void addECRef(UniprotECRef ref, long uniprotEntryId) { - try { - ecCrossReferences.write(Long.toString(uniprotEntryId), ref.getId()); - } catch (IOException e) { - System.err.println(new Timestamp(System.currentTimeMillis()) - + " Error adding this EC reference to the database."); - e.printStackTrace(); - } - - } - - /** - * Adds a uniprot entry InterPro reference to the database - * - * @param ref - * The uniprot InterPro reference to add - * @param uniprotEntryId - * The uniprotEntry of the cross reference - */ - public void addInterProRef(UniprotInterProRef ref, long uniprotEntryId) { - try { - interProCrossReferences.write(Long.toString(uniprotEntryId), ref.getId()); - } catch (IOException e) { - System.err.println(new Timestamp(System.currentTimeMillis()) - + " Error adding this InterPro reference to the database."); - e.printStackTrace(); - } - } - - @Override - public void handleEntry(UniprotEntry entry) { - store(entry); - } - - @Override - public void close() { - try { - uniprotEntries.close(); - peptides.close(); - goCrossReferences.close(); - ecCrossReferences.close(); - interProCrossReferences.close(); - } 
catch(IOException e) { - System.err.println(new Timestamp(System.currentTimeMillis()) - + " Something closing the csv files."); - e.printStackTrace(); - } - } - -} diff --git a/scripts/helper_scripts/parser/src/taxons/Taxon.java b/scripts/helper_scripts/parser/src/taxons/Taxon.java deleted file mode 100755 index 43eeb376..00000000 --- a/scripts/helper_scripts/parser/src/taxons/Taxon.java +++ /dev/null @@ -1,55 +0,0 @@ -package taxons; - -import java.util.EnumMap; -import java.util.Map; - -public class Taxon { - - final public String name; - final public Rank rank; - final public int parent; - - public boolean valid; - - public Taxon(String name, Rank rank, int parent) { - this.name = name; - this.rank = rank; - this.parent = parent; - this.valid = true; - } - - public void invalidate() { - this.valid = false; - } - - public boolean valid() { - return this.valid; - } - - - public static enum Rank { - NO_RANK, SUPERKINGDOM, KINGDOM, SUBKINGDOM, SUPERPHYLUM, PHYLUM, SUBPHYLUM, SUPERCLASS, CLASS, SUBCLASS, SUPERORDER, ORDER, SUBORDER, INFRAORDER, SUPERFAMILY, FAMILY, SUBFAMILY, TRIBE, SUBTRIBE, GENUS, SUBGENUS, SPECIES_GROUP, SPECIES_SUBGROUP, SPECIES, SUBSPECIES, STRAIN, VARIETAS, FORMA; - - public static final Rank[] values = Rank.values(); - - private static final Map indices = new EnumMap(Rank.class); - static { - for(int i = 0; i < values.length; i++) { - indices.put(values[i], i); - } - } - - public int index() { - return indices.get(this); - } - - public String toString() { - return this.name().toLowerCase().replace('_', ' '); - } - - public static Rank fromString(String s) { - return valueOf(s.toUpperCase().replace(' ', '_')); - } - } - -} diff --git a/scripts/helper_scripts/parser/src/taxons/TaxonList.java b/scripts/helper_scripts/parser/src/taxons/TaxonList.java deleted file mode 100755 index f0173f2d..00000000 --- a/scripts/helper_scripts/parser/src/taxons/TaxonList.java +++ /dev/null @@ -1,172 +0,0 @@ -package taxons; - -import storage.CSV; - -import java.util.ArrayList; -import java.util.regex.Pattern; -import java.io.FileReader; -import java.io.BufferedReader; -import java.io.IOException; -import java.io.FileNotFoundException; - -public class TaxonList extends ArrayList { - - private static final Pattern PATTERN = Pattern.compile("\\|"); - - public TaxonList() { - super(); - } - - public static TaxonList loadFromFile(String filename) throws IOException { - TaxonList tl = new TaxonList(); - CSV.Reader reader = new CSV.Reader(filename); - String[] row = null; - while((row = reader.read()) != null) { - int id = Integer.parseInt(row[0]); - Taxon t = new Taxon( - row[1], - Taxon.Rank.fromString(row[2]), - Integer.parseInt(row[3]) - ); - if(! 
CSV.toBoolean(row[4])) t.invalidate(); - while(tl.size() <= id) tl.add(null); - tl.set(id, t); - } - return tl; - } - - public static TaxonList parseDumps(String namesFile, String nodesFile) - throws FileNotFoundException, IOException { - TaxonList tl = new TaxonList(); - BufferedReader names = new BufferedReader(new FileReader(namesFile)); - BufferedReader nodes = new BufferedReader(new FileReader(nodesFile)); - - String nodeline = null; - while((nodeline = nodes.readLine()) != null) { - String[] noderow = PATTERN.split(nodeline); - int taxon_id = Integer.parseInt(noderow[0].trim()); - int parent_id = Integer.parseInt(noderow[1].trim()); - Taxon.Rank rank = Taxon.Rank.fromString(noderow[2].trim()); - - String nameline = null; - String name = null, clas = null; - int taxon_id2 = -1; - while(!"scientific name".equals(clas) && (nameline = names.readLine()) != null) { - String[] namerow = PATTERN.split(nameline); - taxon_id2 = Integer.parseInt(namerow[0].trim()); - name = namerow[1].trim(); - clas = namerow[3].trim(); - } - - if("scientific name".equals(clas) && taxon_id == taxon_id2) { - while(tl.size() <= taxon_id) tl.add(null); - tl.set(taxon_id, new Taxon(name, rank, parent_id)); - } else { - throw new RuntimeException("Taxon " + taxon_id + - " did not have a scientific name."); - } - } - - names.close(); - nodes.close(); - - return tl; - - } - - public void invalidate() { - for(int i = 0; i < size(); i++) validate(i); - } - - private boolean validate(int taxon_id) { - Taxon t = get(taxon_id); - - if(t == null) return false; - - if(! t.valid() - || (t.rank == Taxon.Rank.SPECIES - && ( - (t.name.matches(".*\\d.*") && !t.name.contains("virus")) - || t.name.endsWith(" sp.") - || t.name.endsWith(" genomosp.") - || t.name.contains(" bacterium") - ) - ) - || t.name.contains("enrichment culture") - || t.name.contains("mixed culture") - || t.name.contains("uncultured") - || t.name.contains("unidentified") - || t.name.contains("unspecified") - || t.name.contains("undetermined") - || t.name.contains("sample") - || t.name.endsWith("metagenome") - || t.name.endsWith("library") - || taxon_id == 28384 - || taxon_id == 48479 - || taxon_id == 1869227) { - t.invalidate(); - return false; - } - - if(taxon_id == 1) return true; - - if(! validate(t.parent)) t.invalidate(); - return t.valid(); - } - - public void writeToFile(String filename) throws IOException { - CSV.Writer writer = new CSV.Writer(filename); - for(int i = 0; i < size(); i++) { - Taxon t = get(i); - if(t != null) writer.write(Integer.toString(i), t.name, - t.rank.toString(), Integer.toString(t.parent), - CSV.toString(t.valid())); - } - writer.close(); - } - - public void writeLineagesToFile(String filename) throws IOException { - CSV.Writer writer = new CSV.Writer(filename); - int nranks = Taxon.Rank.values.length; - - for(int i = 0; i < size(); i++) { - Taxon t = get(i); - if(t == null) continue; - - // +1 want - no_rank + lineage_id + taxon_id - String[] lineage = new String[nranks]; - lineage[0] = Integer.toString(i); - - int tid = rankedAncestor(i); - t = get(tid); - boolean valid = t.valid(); - for(int j = nranks - 1; j >= 1; j--) { - if(j > t.rank.index()) { - lineage[j] = valid ? null : "-1"; - } else { - valid = t.valid(); - lineage[j] = Integer.toString((valid ? 
1 : -1) * tid); - tid = rankedAncestor(t.parent); - t = get(tid); - } - } - - writer.write(lineage); - } - - writer.close(); - } - - private int rankedAncestor(int tid) { - Taxon t = get(tid); - int pid = -1; - while(t != null && tid != pid && t.rank == Taxon.Rank.NO_RANK) { - pid = tid; - tid = t.parent; - t = get(tid); - } - if(t != null) return tid; - return 1; // only used in case a taxon is no descendant of root - } - -} diff --git a/scripts/helper_scripts/parser/src/tools/LineagesSequencesTaxons2LCAs.java b/scripts/helper_scripts/parser/src/tools/LineagesSequencesTaxons2LCAs.java deleted file mode 100755 index 67cf3630..00000000 --- a/scripts/helper_scripts/parser/src/tools/LineagesSequencesTaxons2LCAs.java +++ /dev/null @@ -1,141 +0,0 @@ -package tools; - -import java.io.*; -import java.sql.Timestamp; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collection; -import java.util.HashMap; -import java.util.regex.Pattern; - -public class LineagesSequencesTaxons2LCAs { - - public static final int GENUS = 18; - public static final int SPECIES = 22; - public static final int RANKS = 27; - private static final Pattern SEPARATOR = Pattern.compile("\t"); - private static final String NULL = "\\N"; - private int[][] taxonomy; - private final Writer writer; - - public LineagesSequencesTaxons2LCAs(String taxonomyFile) throws IOException { - writer = new BufferedWriter(new OutputStreamWriter(System.out, "utf-8")); - buildTaxonomy(taxonomyFile); - } - - private void buildTaxonomy(String file) throws FileNotFoundException, IOException { - HashMap taxonomyMap = new HashMap<>(); - InputStream is = new FileInputStream(new File(file)); - BufferedReader br = new BufferedReader(new InputStreamReader(is)); - - br.lines() - .forEach(line -> { - String[] elements = SEPARATOR.split(line, 28); - - int key = Integer.parseInt(elements[0]); - int[] lineage = Arrays.stream(elements) - .skip(1)// skip taxonId - .mapToInt(s -> s.toUpperCase().equals("\\N") ? 
0 : Integer.parseInt(s)) - .toArray(); - - taxonomyMap.put(key, lineage); - }); - - int max = taxonomyMap.keySet().stream().max(Integer::compare).get(); - taxonomy = new int[max + 1][]; - taxonomyMap.keySet().stream().forEach(key -> taxonomy[key] = taxonomyMap.get(key)); - } - - public void calculateLCAs() throws IOException { - BufferedReader br = new BufferedReader(new InputStreamReader(System.in), 67108864); - - int count = 0; - String currentSequence = null; - Collection taxa = new ArrayList<>(); - String line; - while ((line = br.readLine()) != null) { - count++; - if (count % 10000000 == 0) { - System.err.println(new Timestamp(System.currentTimeMillis()) + ": " + count); - } - - // outperforms split by at least 20% - int t = line.indexOf('\t'); - String sequence = line.substring(0, t); - int taxonId = Integer.parseInt(line.substring(t + 1)); - - if (currentSequence == null || !currentSequence.equals(sequence)) { - if (currentSequence != null) { - handleLCA(currentSequence, calculateLCA(taxa)); - } - - currentSequence = sequence; - taxa.clear(); - } - - taxa.add(taxonId); - } - handleLCA(currentSequence, calculateLCA(taxa)); - } - - private int calculateLCA(Collection taxa) { - int lca = 1; - int[][] lineages = taxa.stream() - .map(t -> taxonomy[t]) - .filter(l -> l != null) - .toArray(int[][]::new); - for (int rank = 0; rank < RANKS; rank++) { - final int finalRank = rank; - final int[] val = {-1}; - boolean allMatch = Arrays.stream(lineages) - .mapToInt(l -> l[finalRank]) - .filter(i -> finalRank == GENUS || finalRank == SPECIES ? i > 0 : i >= 0) - .peek(i -> val[0] = val[0] == -1 ? i : val[0]) - .allMatch(i -> i == val[0]); - - if (val[0] != -1) { - if (!allMatch) { - break; - } - if (val[0] != 0) { - lca = val[0]; - } - } - } - return lca; - } - - private void handleLCA(String sequence, int lca) { - try { - writer.write(sequence + "\t" + lca + '\n'); - } catch (IOException e) { - e.printStackTrace(); - } - } - - public void close() throws IOException { - writer.close(); - } - - /** - * first argument should be the lineages in tsv format without a header row. Create by running: - * $ echo "select * from lineages;" | mysql -u unipept -p unipept | sed 1d > lineages.tsv - *

- * standard input should be the peptides in tsv format with a header row. Create by running: - * $ echo "select sequence_id, taxon_id from peptides left join uniprot_entries on peptides.uniprot_entry_id = uniprot_entries.id;" | \n - * mysql -u unipept -p unipept -q | sort -S 50% --parallel=12 -k1n > sequences.tsv - * - * @param args - */ - public static void main(String... args) { - try { - System.err.println(new Timestamp(System.currentTimeMillis()) + ": reading taxonomy"); - LineagesSequencesTaxons2LCAs l = new LineagesSequencesTaxons2LCAs(args[0]); - System.err.println(new Timestamp(System.currentTimeMillis()) + ": reading sequences"); - l.calculateLCAs(); - l.close(); - } catch (IOException e) { - e.printStackTrace(); - } - } -} diff --git a/scripts/helper_scripts/parser/src/tools/NamesNodes2TaxonsLineages.java b/scripts/helper_scripts/parser/src/tools/NamesNodes2TaxonsLineages.java deleted file mode 100755 index 06a72e56..00000000 --- a/scripts/helper_scripts/parser/src/tools/NamesNodes2TaxonsLineages.java +++ /dev/null @@ -1,36 +0,0 @@ -package tools; - -import java.io.IOException; -import java.io.FileNotFoundException; - -import com.beust.jcommander.Parameter; -import com.beust.jcommander.JCommander; -import taxons.TaxonList; - -public class NamesNodes2TaxonsLineages { - - @Parameter(names="--names", description="Taxon names input file") public String namesFile; - @Parameter(names="--nodes", description="Taxon nodes input file") public String nodesFile; - @Parameter(names="--taxons", description="Taxon TSV output file") public String taxonsFile; - @Parameter(names="--lineages", description="Lineages TSV output file") public String lineagesFile; - - /** - * Parse a list of taxons and their lineages from the NCBI dumps. - * - * This program will parse the first two argument files, and create the next - * two. The first two arguments are the nodes.dmp and names.dmp files - * downloaded from the NCBI. TSV-dumps of the parsed taxons and lineages - * will be written to the third and fourth parameter. 
- */ - public static void main(String[] args) throws IOException { - NamesNodes2TaxonsLineages main = new NamesNodes2TaxonsLineages(); - new JCommander(main, args); - - TaxonList tl = TaxonList.parseDumps(main.namesFile, main.nodesFile); - tl.invalidate(); - tl.writeToFile(main.taxonsFile); - tl.writeLineagesToFile(main.lineagesFile); - } - -} - diff --git a/scripts/helper_scripts/parser/src/tools/TaxonsUniprots2Tables.java b/scripts/helper_scripts/parser/src/tools/TaxonsUniprots2Tables.java deleted file mode 100644 index 14763ee7..00000000 --- a/scripts/helper_scripts/parser/src/tools/TaxonsUniprots2Tables.java +++ /dev/null @@ -1,49 +0,0 @@ -package tools; - -import java.io.IOException; - -import com.beust.jcommander.Parameter; -import com.beust.jcommander.JCommander; - -import storage.TableWriter; -import tsv.UniprotTabParser; - -public class TaxonsUniprots2Tables { - - @Parameter(names="--peptide-min", description="Minimum peptide length") public int peptideMin; - @Parameter(names="--peptide-max", description="Maximum peptide length") public int peptideMax; - @Parameter(names="--taxons", description="Taxons TSV input file") public String taxonsFile; - @Parameter(names="--peptides", description="Peptides TSV output file") public String peptidesFile; - @Parameter(names="--uniprot-entries", description="Uniprot entries TSV output file") public String uniprotEntriesFile; - @Parameter(names="--ec", description="EC references TSV output file") public String ecCrossReferencesFile; - @Parameter(names="--go", description="GO references TSV output file") public String goCrossReferencesFile; - @Parameter(names="--interpro", description="InterPro references TSV output file") public String interProCrossReferencesFile; - @Parameter(names="--verbose", description="Enable verbose mode") public boolean verboseMode; - - /** - * Parse the UniProt TSV-file into TSV tables. - * - * The first parameter is a taxon file, as written by NamesNodes2Taxons. The next 5 parameters are the output files, - * all in TSV format. In order, they are: the peptides, the uniprot entries, the EC cross references, the GO cross - * references and the InterPro cross references. - * - * This program reads input from stdin and writes output to the files indicated by the parameters given above. - */ - public static void main(String[] args) throws IOException { - TaxonsUniprots2Tables main = new TaxonsUniprots2Tables(); - new JCommander(main, args); - - if (main.verboseMode) { - System.err.println("INFO: TaxonsUniprots2Tables - Verbose mode enabled."); - } - - TableWriter writer = new TableWriter(main); - - UniprotTabParser parser = new UniprotTabParser(); - parser.parse(main.peptideMin, main.peptideMax, System.in, writer, main.verboseMode); - - writer.close(); - } - -} - diff --git a/scripts/helper_scripts/parser/src/tools/XmlToTabConverter.java b/scripts/helper_scripts/parser/src/tools/XmlToTabConverter.java deleted file mode 100755 index ab045f19..00000000 --- a/scripts/helper_scripts/parser/src/tools/XmlToTabConverter.java +++ /dev/null @@ -1,35 +0,0 @@ -package tools; - -import org.xml.sax.SAXException; -import storage.TabWriter; -import xml.UniprotHandler; - -import javax.xml.parsers.ParserConfigurationException; -import javax.xml.parsers.SAXParser; -import javax.xml.parsers.SAXParserFactory; -import java.io.*; - -/** - * This tool accepts 3 different arguments: - * peptide_min_length, peptide_max_length, database_type_name - * - * The input is read from stdin and the output of this script is written to stdout. 
- * - * This tool's job is to produce a TSV-file with the same contents as the XML-file that's fed into this script. - */ -public class XmlToTabConverter { - public static void main(String[] args) throws IOException, SAXException, ParserConfigurationException { - SAXParser parser = SAXParserFactory.newInstance().newSAXParser(); - - InputStream uniprotStream = System.in; - UniprotHandler handler = new UniprotHandler(Integer.parseInt(args[0]), Integer.parseInt(args[1]), args[2]); - - TabWriter writer = new TabWriter(System.out, Boolean.parseBoolean(args[3])); - handler.addObserver(writer); - - parser.parse(uniprotStream, handler); - - uniprotStream.close(); - writer.close(); - } -} diff --git a/scripts/helper_scripts/parser/src/tsv/UniprotTabParser.java b/scripts/helper_scripts/parser/src/tsv/UniprotTabParser.java deleted file mode 100755 index 0d057cb8..00000000 --- a/scripts/helper_scripts/parser/src/tsv/UniprotTabParser.java +++ /dev/null @@ -1,77 +0,0 @@ -package tsv; - -import xml.*; - -import java.io.*; -import java.util.HashMap; -import java.util.Map; -import java.util.stream.Stream; - -public class UniprotTabParser { - public void parse( - int peptideMinLength, - int peptideMaxLength, - InputStream input, - UniprotObserver observer, - boolean verbose - ) throws IOException { - BufferedReader reader = new BufferedReader(new InputStreamReader(input)); - - String line = reader.readLine().trim(); - String[] header = Stream.of(line.split("\t")).map(String::trim).toArray(String[]::new); - - Map headerMap = new HashMap(); - for (int i = 0; i < header.length; i++) { - headerMap.put(header[i], i); - } - - line = reader.readLine(); - - while (line != null) { - if (verbose) { - System.err.println("INFO VERBOSE: TSV line parsed: " + line); - } - - String[] fields = line.trim().split("\t"); - - try { - // We need to emit one new UniprotEntry per line in the input - UniprotEntry entry = new UniprotEntry(fields[headerMap.get("Status")].trim(), peptideMinLength, peptideMaxLength); - - // Now convert all fields into the correct Uniprot entry properties - entry.setUniprotAccessionNumber(fields[headerMap.get("Entry")]); - entry.setSequence(fields[headerMap.get("Sequence")].trim()); - - entry.setRecommendedName(fields[headerMap.get("Protein names")].trim()); - // Todo, does not always need to be set? - // entry.setSubmittedName("name"); - - entry.setVersion(Integer.parseInt(fields[headerMap.get("Version (entry)")].trim())); - - for (String ecNumber : fields[headerMap.get("EC number")].split(";")) { - entry.addECRef(new UniprotECRef(ecNumber.trim())); - } - - for (String goTerm : fields[headerMap.get("Gene ontology IDs")].split(";")) { - entry.addGORef(new UniprotGORef(goTerm.trim())); - } - - for (String interpro : fields[headerMap.get("Cross-reference (InterPro)")].split(";")) { - entry.addInterProRef(new UniprotInterProRef(interpro.trim())); - } - - entry.setTaxonId(Integer.parseInt(fields[headerMap.get("Organism ID")])); - - // Emit entry that's finished and handle it... 
- observer.handleEntry(entry); - } catch (Exception e) { - System.err.println("Invalid entry ignored: " + line); - System.err.println("Invalid entry error details: " + e.getMessage()); - } - - line = reader.readLine(); - } - - reader.close(); - } -} diff --git a/scripts/helper_scripts/parser/src/xml/UniprotDbRef.java b/scripts/helper_scripts/parser/src/xml/UniprotDbRef.java deleted file mode 100755 index cdbd58cd..00000000 --- a/scripts/helper_scripts/parser/src/xml/UniprotDbRef.java +++ /dev/null @@ -1,39 +0,0 @@ -package xml; - -public class UniprotDbRef { - - private String type; - private String sequenceId; - private String proteinId; - - public UniprotDbRef(String type, String sequenceId, String proteinId) { - this.type = type; - this.sequenceId = sequenceId; - this.proteinId = proteinId; - } - - public UniprotDbRef(String type) { - this.type = type; - } - - public String getType() { - return type; - } - - public String getSequenceId() { - return sequenceId; - } - - public void setSequenceId(String sequenceId) { - this.sequenceId = sequenceId; - } - - public String getProteinId() { - return proteinId; - } - - public void setProteinId(String proteinId) { - this.proteinId = proteinId; - } - -} diff --git a/scripts/helper_scripts/parser/src/xml/UniprotECRef.java b/scripts/helper_scripts/parser/src/xml/UniprotECRef.java deleted file mode 100755 index 4adba41f..00000000 --- a/scripts/helper_scripts/parser/src/xml/UniprotECRef.java +++ /dev/null @@ -1,14 +0,0 @@ -package xml; - -public class UniprotECRef { - - private String id; - - public UniprotECRef(String id) { - this.id = id; - } - - public String getId() { - return id; - } -} diff --git a/scripts/helper_scripts/parser/src/xml/UniprotEntry.java b/scripts/helper_scripts/parser/src/xml/UniprotEntry.java deleted file mode 100755 index 870442da..00000000 --- a/scripts/helper_scripts/parser/src/xml/UniprotEntry.java +++ /dev/null @@ -1,162 +0,0 @@ -package xml; - -import java.util.Arrays; -import java.util.ArrayList; -import java.util.List; -import java.util.stream.Stream; - -/** - * @author Bart Mesuere - * - */ -public class UniprotEntry { - - // peptide settings - private final int peptideMin; - private final int peptideMax; - - private String uniprotAccessionNumber; - private int version; - private int taxonId; - private String type; - private String recommendedName; - private String submittedName; - private String sequence; - private List dbReferences; - private List goReferences; - private List ecReferences; - private List interProReferences; - private List sequences; - - public UniprotEntry(String type, int peptideMin, int peptideMax) { - this.type = type; - this.peptideMin = peptideMin; - this.peptideMax = peptideMax; - dbReferences = new ArrayList(); - goReferences = new ArrayList(); - ecReferences = new ArrayList(); - interProReferences = new ArrayList(); - sequences = new ArrayList(); - } - - public void reset(String type) { - uniprotAccessionNumber = null; - version = 0; - taxonId = 0; - this.type = type; - recommendedName = null; - submittedName = null; - sequence = null; - dbReferences.clear(); - goReferences.clear(); - ecReferences.clear(); - interProReferences.clear(); - sequences.clear(); - } - - public String getUniprotAccessionNumber() { - return uniprotAccessionNumber; - } - - public void setUniprotAccessionNumber(String uniprotAccessionNumber) { - if(this.uniprotAccessionNumber == null) { - this.uniprotAccessionNumber = uniprotAccessionNumber; - } - } - - public int getVersion() { - return version; - } - - public void 
setVersion(int version) { - this.version = version; - } - - public int getTaxonId() { - return taxonId; - } - - public void setTaxonId(int taxonId) { - this.taxonId = taxonId; - } - - public String getType() { - return type; - } - - public String getName() { - if(recommendedName != null) return recommendedName; - return submittedName; - } - - public void setRecommendedName(String name) { - recommendedName = name; - } - - public void setSubmittedName(String name) { - submittedName = name; - } - - public String getSequence() { - return sequence; - } - - public void setSequence(String sequence) { - this.sequence = sequence.replace(" ", ""); - } - - public void addDbRef(UniprotDbRef ref) { - dbReferences.add(ref); - } - - public void addGORef(UniprotGORef ref) { - goReferences.add(ref); - } - - public void addECRef(UniprotECRef ref) { - ecReferences.add(ref); - } - - public void addInterProRef(UniprotInterProRef ref) { interProReferences.add(ref); } - - public List digest() { - sequences.clear(); - int start = 0; - int length = sequence.length(); - for (int i = 0; i < length; i++) { - char x = sequence.charAt(i); - if ((x == 'K' || x == 'R') && (i + 1 < length && sequence.charAt(i + 1) != 'P')) { - if (i + 1 - start >= peptideMin && i + 1 - start <= peptideMax) { - sequences.add(sequence.substring(start, i + 1)); - } - start = i + 1; - } - } - if (length - start >= peptideMin && length - start <= peptideMax) { - sequences.add(sequence.substring(start, length)); - } - return sequences; - } - - public List getDbReferences() { - return dbReferences; - } - - public List getGOReferences() { - return goReferences; - } - - public List getECReferences() { - return ecReferences; - } - - public List getInterProReferences(){ return interProReferences; } - - - @Override - public String toString() { - return uniprotAccessionNumber + ", " + version + ", " + taxonId + ", " + type + ", " - + sequence; - } - -} diff --git a/scripts/helper_scripts/parser/src/xml/UniprotGORef.java b/scripts/helper_scripts/parser/src/xml/UniprotGORef.java deleted file mode 100755 index d1c909f1..00000000 --- a/scripts/helper_scripts/parser/src/xml/UniprotGORef.java +++ /dev/null @@ -1,14 +0,0 @@ -package xml; - -public class UniprotGORef { - - private String id; - - public UniprotGORef(String id) { - this.id = id; - } - - public String getId() { - return id; - } -} diff --git a/scripts/helper_scripts/parser/src/xml/UniprotHandler.java b/scripts/helper_scripts/parser/src/xml/UniprotHandler.java deleted file mode 100755 index 8164d162..00000000 --- a/scripts/helper_scripts/parser/src/xml/UniprotHandler.java +++ /dev/null @@ -1,249 +0,0 @@ -package xml; - -import java.sql.Timestamp; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Map; - -import org.xml.sax.Attributes; -import org.xml.sax.SAXException; -import org.xml.sax.helpers.DefaultHandler; - -public class UniprotHandler extends DefaultHandler { - - private final String uniprotType; - - private UniprotEntry currentItem; - private UniprotDbRef dbRef; - private UniprotGORef goRef; - private UniprotECRef ecRef; - private UniprotInterProRef interProRef; - private StringBuilder charData; - private int i; - private boolean inComment = false; - private boolean inOrganism = false; - private boolean inEvidence = false; - private boolean inRecommendedName = false; - private boolean inSubmittedName = false; - private List observers; - - private Map endTagWorkers; - private Map startTagWorkers; - - public UniprotHandler(int peptideMinLength, 
int peptideMaxLength, String uniprotType) { - super(); - this.uniprotType = uniprotType; - currentItem = new UniprotEntry(uniprotType, peptideMinLength, peptideMaxLength); - charData = new StringBuilder(); - observers = new ArrayList(); - - // set up end tag workers - endTagWorkers = new HashMap(); - endTagWorkers.put("entry", new EndTagWorker() { - @Override - public void handleTag(String data) { - emitEntry(currentItem); - } - }); - endTagWorkers.put("accession", new EndTagWorker() { - @Override - public void handleTag(String data) { - currentItem.setUniprotAccessionNumber(data); - } - }); - endTagWorkers.put("organism", new EndTagWorker() { - @Override - public void handleTag(String data) { - inOrganism = false; - } - }); - endTagWorkers.put("evidence", new EndTagWorker() { - @Override - public void handleTag(String data) { - inEvidence = false; - } - }); - endTagWorkers.put("recommendedName", new EndTagWorker() { - @Override - public void handleTag(String data) { - inRecommendedName = false; - } - }); - endTagWorkers.put("submittedName", new EndTagWorker() { - @Override - public void handleTag(String data) { - inSubmittedName = false; - } - }); - endTagWorkers.put("sequence", new EndTagWorker() { - @Override - public void handleTag(String data) { - currentItem.setSequence(data); - } - }); - endTagWorkers.put("dbReference", new EndTagWorker() { - @Override - public void handleTag(String data) { - if (inComment) { - return; - } - - if (!inOrganism) { - if (dbRef != null) { - currentItem.addDbRef(dbRef); - dbRef = null; - } else if (goRef != null) { - currentItem.addGORef(goRef); - goRef = null; - } else if (ecRef != null) { - currentItem.addECRef(ecRef); - ecRef = null; - } else if (interProRef != null) { - currentItem.addInterProRef(interProRef); - interProRef = null; - } - } - } - }); - endTagWorkers.put("fullName", new EndTagWorker() { - @Override - public void handleTag(String data) { - if (inRecommendedName) { - currentItem.setRecommendedName(data); - } else if (inSubmittedName) { - currentItem.setSubmittedName(data); - } - } - }); - endTagWorkers.put("comment", new EndTagWorker() { - @Override - public void handleTag(String data) { - inComment = false; - } - }); - - // set up start tag workers - startTagWorkers = new HashMap(); - startTagWorkers.put("entry", new StartTagWorker() { - @Override - public void handleTag(Attributes atts) { - newCurrentItem(); - currentItem.setVersion(Integer.valueOf(atts.getValue("version"))); - } - }); - startTagWorkers.put("organism", new StartTagWorker() { - @Override - public void handleTag(Attributes atts) { - inOrganism = true; - } - }); - startTagWorkers.put("evidence", new StartTagWorker() { - @Override - public void handleTag(Attributes atts) { - inEvidence = true; - } - }); - startTagWorkers.put("recommendedName", new StartTagWorker() { - @Override - public void handleTag(Attributes atts) { - inRecommendedName = true; - } - }); - startTagWorkers.put("submittedName", new StartTagWorker() { - @Override - public void handleTag(Attributes atts) { - inSubmittedName = true; - } - }); - startTagWorkers.put("dbReference", new StartTagWorker() { - @Override - public void handleTag(Attributes atts) { - // Skip references if they are embedded in comments (otherwise, these could cause duplicate identifiers) - if (inComment) { - return; - } - - if (inOrganism) { - if (atts.getValue("type").equals("NCBI Taxonomy")) - currentItem.setTaxonId(Integer.valueOf(atts.getValue("id"))); - } else if (!inEvidence) { - if (atts.getValue("type").equals("EMBL")) { - 
dbRef = new UniprotDbRef("EMBL"); - dbRef.setSequenceId(atts.getValue("id")); - } else if (atts.getValue("type").equals("RefSeq")) { - dbRef = new UniprotDbRef("RefSeq"); - dbRef.setProteinId(atts.getValue("id")); - } else if (atts.getValue("type").equals("GO")) { - goRef = new UniprotGORef(atts.getValue("id")); - } else if (atts.getValue("type").equals("EC")) { - ecRef = new UniprotECRef(atts.getValue("id")); - } else if (atts.getValue("type").equals("InterPro")) { - interProRef = new UniprotInterProRef(atts.getValue("id")); - } - } - } - }); - startTagWorkers.put("property", new StartTagWorker() { - @Override - public void handleTag(Attributes atts) { - if (dbRef != null) { - if (atts.getValue("type").equals("protein sequence ID")) - dbRef.setProteinId(atts.getValue("value")); - else if (atts.getValue("type").equals("nucleotide sequence ID")) - dbRef.setSequenceId(atts.getValue("value")); - } - } - }); - startTagWorkers.put("comment", new StartTagWorker() { - @Override - public void handleTag(Attributes atts) { - inComment = true; - } - }); - } - - @Override - public void startElement(String namespaceURI, String localName, String qName, Attributes atts) { - StartTagWorker worker = startTagWorkers.get(qName); - if (worker != null) { - worker.handleTag(atts); - } - } - - @Override - public void endElement(String uri, String localName, String qName) throws SAXException { - EndTagWorker worker = endTagWorkers.get(qName); - if (worker != null) { - worker.handleTag(charData.toString().trim()); - } - charData.delete(0, charData.length()); - } - - @Override - public void characters(char[] ch, int start, int length) throws SAXException { - charData.append(ch, start, length); - } - - private void newCurrentItem() { - currentItem.reset(uniprotType); - } - - private interface StartTagWorker { - void handleTag(Attributes att); - } - - private interface EndTagWorker { - void handleTag(String data); - } - - public void addObserver(UniprotObserver o) { - observers.add(o); - } - - private void emitEntry(UniprotEntry entry) { - for (UniprotObserver o : observers) { - o.handleEntry(entry); - } - } -} diff --git a/scripts/helper_scripts/parser/src/xml/UniprotInterProRef.java b/scripts/helper_scripts/parser/src/xml/UniprotInterProRef.java deleted file mode 100755 index 87211e91..00000000 --- a/scripts/helper_scripts/parser/src/xml/UniprotInterProRef.java +++ /dev/null @@ -1,14 +0,0 @@ -package xml; - -public class UniprotInterProRef { - - private String id; - - public UniprotInterProRef(String id) { - this.id = id; - } - - public String getId() { - return id; - } -} diff --git a/scripts/helper_scripts/parser/src/xml/UniprotObserver.java b/scripts/helper_scripts/parser/src/xml/UniprotObserver.java deleted file mode 100755 index 45bf4bc2..00000000 --- a/scripts/helper_scripts/parser/src/xml/UniprotObserver.java +++ /dev/null @@ -1,8 +0,0 @@ -package xml; - -import xml.UniprotEntry; - -public interface UniprotObserver { - public void handleEntry(UniprotEntry entry); - public void close(); -} diff --git a/scripts/helper_scripts/unipept-database-rs/src/bin/dat-parser.rs b/scripts/helper_scripts/unipept-database-rs/src/bin/dat-parser.rs index 007716d3..7e9e3c6b 100644 --- a/scripts/helper_scripts/unipept-database-rs/src/bin/dat-parser.rs +++ b/scripts/helper_scripts/unipept-database-rs/src/bin/dat-parser.rs @@ -2,7 +2,6 @@ use anyhow::{Context, Result}; use clap::Parser; use unipept_database::dat_parser::uniprot_dat_parser; use unipept_database::dat_parser::utils::write_header; -use 
unipept_database::uniprot::UniprotType;
 
 use unipept_database::utils::files::open_sin;
@@ -24,8 +23,8 @@ fn main() -> Result<()> {
 
 #[derive(Parser, Debug)]
 struct Cli {
-    #[clap(value_enum, short = 't', long, default_value_t = UniprotType::Swissprot)]
-    db_type: UniprotType,
+    #[clap(short = 't', long, default_value = "swissprot")]
+    db_type: String,
     #[clap(long, default_value_t = 0)]
     threads: usize,
 }
diff --git a/scripts/helper_scripts/unipept-database-rs/src/bin/taxa-by-chunk.rs b/scripts/helper_scripts/unipept-database-rs/src/bin/taxa-by-chunk.rs
index 64a20225..a19958ff 100644
--- a/scripts/helper_scripts/unipept-database-rs/src/bin/taxa-by-chunk.rs
+++ b/scripts/helper_scripts/unipept-database-rs/src/bin/taxa-by-chunk.rs
@@ -31,7 +31,7 @@ fn main() -> Result<()> {
         all_taxa.push(taxa_id);
     }
 
-    let chunk_file_regex = Regex::new(r"unipept\..*\.gz").context("Error creating regex")?;
+    let chunk_file_regex = Regex::new(r"unipept\..*\.lz4").context("Error creating regex")?;
 
     for entry in read_dir(&args.chunk_dir).context("Error reading chunk directory")? {
         let entry = entry.context("Error reading entry from chunk directory")?;
@@ -52,7 +52,7 @@ fn main() -> Result<()> {
         }
 
         // Parse the taxa range out of the filename
-        let replaced_name = base_name.replace("unipept.", "").replace(".chunk.gz", "");
+        let replaced_name = base_name.replace("unipept.", "").replace(".chunk.lz4", "");
         let range = replaced_name.split_once('-');
         let range = range.with_context(|| format!("Unable to split {replaced_name} on '-'"))?;
         let start: u64 = range
diff --git a/scripts/helper_scripts/unipept-database-rs/src/bin/xml-parser.rs b/scripts/helper_scripts/unipept-database-rs/src/bin/xml-parser.rs
index 9e102cb6..b10ae097 100644
--- a/scripts/helper_scripts/unipept-database-rs/src/bin/xml-parser.rs
+++ b/scripts/helper_scripts/unipept-database-rs/src/bin/xml-parser.rs
@@ -4,7 +4,6 @@ use std::num::NonZeroUsize;
 use anyhow::{Context, Result};
 use clap::Parser;
 use smartstring::{LazyCompact, SmartString};
-use unipept_database::uniprot::UniprotType;
 use uniprot::uniprot::{SequentialParser, ThreadedParser};
 
 use unipept_database::utils::files::open_sin;
@@ -50,8 +49,8 @@ type SmartStr = SmartString;
 // Parse a Uniprot XML file and convert it into a TSV-file
 #[derive(Parser, Debug)]
 struct Cli {
-    #[clap(value_enum, short = 't', long, default_value_t = UniprotType::Swissprot)]
-    uniprot_type: UniprotType,
+    #[clap(short = 't', long, default_value = "swissprot")]
+    uniprot_type: String,
     #[clap(long, default_value_t = 0)]
     threads: u32,
     #[clap(short, long, default_value_t = false)]
@@ -123,7 +122,7 @@ fn parse_name(entry: &uniprot::uniprot::Entry) -> SmartStr {
 }
 
 /// Write a single UniProt entry to stdout
-fn write_entry(entry: &uniprot::uniprot::Entry, db_type: &UniprotType, verbose: bool) {
+fn write_entry(entry: &uniprot::uniprot::Entry, db_type: &str, verbose: bool) {
     let accession_number: SmartStr = entry.accessions[0].clone();
     let sequence: SmartStr = entry.sequence.value.clone();
 
@@ -165,7 +164,7 @@ fn write_entry(entry: &uniprot::uniprot::Entry, db_type: &UniprotType, verbose:
         SmartStr::from(ec_references.join(";")),
         SmartStr::from(go_references.join(";")),
         SmartStr::from(ip_references.join(";")),
-        SmartStr::from(db_type.to_str()),
+        SmartStr::from(db_type),
         taxon_id,
     ];
 
diff --git a/scripts/helper_scripts/unipept-database-rs/src/dat_parser/entry.rs b/scripts/helper_scripts/unipept-database-rs/src/dat_parser/entry.rs
index 4d22fd78..0c2745a6 100644
--- a/scripts/helper_scripts/unipept-database-rs/src/dat_parser/entry.rs
+++ b/scripts/helper_scripts/unipept-database-rs/src/dat_parser/entry.rs
@@ -1,8 +1,5 @@
-use std::collections::HashSet;
-
 use anyhow::Context;
-
-use crate::uniprot::UniprotType;
+use std::collections::HashSet;
 
 // Constants to aid in parsing
 const COMMON_PREFIX_LEN: usize = "ID ".len();
@@ -49,7 +46,7 @@ impl UniProtDATEntry {
     }
 
     /// Write an entry to stdout
-    pub fn write(&self, db_type: &UniprotType) {
+    pub fn write(&self, db_type: &str) {
         if self.name.is_empty() {
             eprintln!(
                 "Could not find a name for entry AC-{}",
@@ -66,7 +63,7 @@ impl UniProtDATEntry {
             self.ec_references.join(";"),
             self.go_references.join(";"),
             self.ip_references.join(";"),
-            db_type.to_str(),
+            db_type,
             self.taxon_id
         )
     }
diff --git a/scripts/helper_scripts/unipept-database-rs/src/lib.rs b/scripts/helper_scripts/unipept-database-rs/src/lib.rs
index 497dd646..9b309f52 100644
--- a/scripts/helper_scripts/unipept-database-rs/src/lib.rs
+++ b/scripts/helper_scripts/unipept-database-rs/src/lib.rs
@@ -2,5 +2,4 @@ pub mod calculate_lcas;
 pub mod dat_parser;
 pub mod taxons_lineages;
 pub mod taxons_uniprots_tables;
-pub mod uniprot;
 pub mod utils;
diff --git a/scripts/helper_scripts/unipept-database-rs/src/taxons_uniprots_tables/table_writer.rs b/scripts/helper_scripts/unipept-database-rs/src/taxons_uniprots_tables/table_writer.rs
index ec9dd1ab..d92b9a15 100644
--- a/scripts/helper_scripts/unipept-database-rs/src/taxons_uniprots_tables/table_writer.rs
+++ b/scripts/helper_scripts/unipept-database-rs/src/taxons_uniprots_tables/table_writer.rs
@@ -112,6 +112,7 @@ impl TableWriter {
                     id,
                     sequence,
                     &summary,
+                    entry.taxon_id,
                 )
                 .context("Failed to write peptide")?;
         }
@@ -125,17 +126,19 @@ impl TableWriter {
         id: i64,
         original_sequence: &[u8],
         annotations: &String,
+        taxon_id: i32,
     ) -> Result<()> {
         self.peptide_count += 1;
 
         writeln!(
             &mut self.peptides,
-            "{}\t{}\t{}\t{}\t{}",
+            "{}\t{}\t{}\t{}\t{}\t{}",
             self.peptide_count,
             String::from_utf8_lossy(&sequence),
             String::from_utf8_lossy(original_sequence),
             id,
-            annotations
+            annotations,
+            taxon_id
         )
         .context("Error writing to TSV")?;
 
diff --git a/scripts/helper_scripts/unipept-database-rs/src/uniprot/mod.rs b/scripts/helper_scripts/unipept-database-rs/src/uniprot/mod.rs
deleted file mode 100644
index ae293ece..00000000
--- a/scripts/helper_scripts/unipept-database-rs/src/uniprot/mod.rs
+++ /dev/null
@@ -1,15 +0,0 @@
-/// Enum for the different kinds of databases
-#[derive(clap::ValueEnum, Clone, Debug)]
-pub enum UniprotType {
-    Swissprot,
-    Trembl,
-}
-
-impl UniprotType {
-    pub fn to_str(&self) -> &str {
-        match self {
-            UniprotType::Swissprot => "swissprot",
-            UniprotType::Trembl => "trembl",
-        }
-    }
-}
diff --git a/scripts/parallel_load.sh b/scripts/parallel_load.sh
index 88224212..fcbfc571 100755
--- a/scripts/parallel_load.sh
+++ b/scripts/parallel_load.sh
@@ -1,5 +1,4 @@
 shopt -s expand_aliases
-alias zcat="pigz -cd"
 
 export db=unipept
 export user=root
@@ -9,16 +8,16 @@ dir="$1"
 
 function load_table() {
     file=$1
-    tbl=`echo $file | sed "s/.tsv.gz//"`
-    echo "zcatting - LOAD DATA LOCAL INFILE '$file' INTO TABLE $tbl"
-    zcat $file | mariadb --local-infile=1 -u$user -p$pass $db -e "LOAD DATA LOCAL INFILE '/dev/stdin' INTO TABLE $tbl;SHOW WARNINGS" 2>&1
+    tbl=`echo $file | sed "s/.tsv.lz4//"`
+    echo "lz4catting - LOAD DATA LOCAL INFILE '$file' INTO TABLE $tbl"
+    lz4 -dc $file | mysql --local-infile=1 -u$user -p$pass $db -e "LOAD DATA LOCAL INFILE '/dev/stdin' INTO TABLE $tbl;SHOW WARNINGS" 2>&1
 }
 export -f load_table
 
 cd "$dir"
-parallel load_table ::: *.tsv.gz
+parallel load_table ::: *.tsv.lz4
 cd "-"
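
Note for reviewers porting the removed Java helpers: the taxon-validity rules deleted from TaxonList.java reduce to a name/rank/id filter. The sketch below is an illustrative Rust rendering of those rules, not the actual code in the taxons_lineages module; the function name and signature are hypothetical, and in the original the result also propagates down the tree, so a taxon is invalidated whenever any of its ancestors is.

```rust
/// Heuristic filter that rejects dummy or unidentified taxa, mirroring the
/// rules of the removed TaxonList.validate(): species names containing digits
/// (unless they mention a virus), "sp."/"genomosp." placeholders, environmental
/// or mixed samples, metagenomes, and a few hard-coded taxon ids.
fn is_acceptable_taxon(taxon_id: u32, name: &str, rank: &str) -> bool {
    let dummy_species = rank == "species"
        && ((name.chars().any(|c| c.is_ascii_digit()) && !name.contains("virus"))
            || name.ends_with(" sp.")
            || name.ends_with(" genomosp.")
            || name.contains(" bacterium"));

    let unidentified = name.contains("enrichment culture")
        || name.contains("mixed culture")
        || name.contains("uncultured")
        || name.contains("unidentified")
        || name.contains("unspecified")
        || name.contains("undetermined")
        || name.contains("sample")
        || name.ends_with("metagenome")
        || name.ends_with("library");

    // Explicitly blacklisted ids from the original implementation.
    let blacklisted = matches!(taxon_id, 28384 | 48479 | 1869227);

    !(dummy_species || unidentified || blacklisted)
}

fn main() {
    // Hypothetical examples; real names and ids come from names.dmp/nodes.dmp.
    assert!(!is_acceptable_taxon(77133, "uncultured bacterium", "species"));
    assert!(is_acceptable_taxon(562, "Escherichia coli", "species"));
}
```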
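The LCA aggregation removed with LineagesSequencesTaxons2LCAs.java walks the 27-rank lineage arrays and keeps descending as long as all lineages agree, treating 0 as "absent" and negative ids as invalidated taxa; at the genus and species ranks only strictly positive values count. A minimal sketch of that consensus walk, assuming lineages are already loaded into fixed-size arrays; the maintained implementation lives in the calculate_lcas module of unipept-database-rs and may differ in detail.

```rust
// Rank layout of the lineage table; genus and species get special treatment
// because a 0 there means "unknown" rather than "not applicable".
const RANKS: usize = 27;
const GENUS: usize = 18;
const SPECIES: usize = 22;

/// Compute the lowest common ancestor of a set of taxa from their lineages
/// (one array of RANKS taxon ids per taxon, 0 = absent, negative = invalid).
fn calculate_lca(lineages: &[[i32; RANKS]]) -> i32 {
    let mut lca = 1; // root
    for rank in 0..RANKS {
        let mut consensus: Option<i32> = None;
        let mut all_match = true;
        for lineage in lineages {
            let value = lineage[rank];
            // Skip values that carry no information at this rank.
            let informative =
                if rank == GENUS || rank == SPECIES { value > 0 } else { value >= 0 };
            if !informative {
                continue;
            }
            match consensus {
                None => consensus = Some(value),
                Some(v) if v != value => {
                    all_match = false;
                    break;
                }
                _ => {}
            }
        }
        if let Some(value) = consensus {
            if !all_match {
                break; // lineages diverge here: the LCA is the last agreed rank
            }
            if value != 0 {
                lca = value;
            }
        }
    }
    lca
}

fn main() {
    // Hypothetical lineages: both share the same superkingdom but diverge deeper.
    let mut a = [0i32; RANKS];
    let mut b = [0i32; RANKS];
    a[0] = 2; a[5] = 1224;
    b[0] = 2; b[5] = 1239;
    assert_eq!(calculate_lca(&[a, b]), 2);
}
```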
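Likewise, the tryptic digest that feeds the peptides table (deleted here from UniprotEntry.java) cleaves after K or R unless the next residue is P, and keeps only peptides whose length falls within the configured bounds. A rough Rust equivalent of that loop, with a made-up example sequence; the actual port lives elsewhere in unipept-database-rs.

```rust
/// Split a protein sequence into tryptic peptides: cleave after K or R,
/// except when the next residue is P. Only peptides whose length lies in
/// [min_len, max_len] are kept, mirroring digest() from the removed Java code.
fn tryptic_digest(sequence: &str, min_len: usize, max_len: usize) -> Vec<&str> {
    let bytes = sequence.as_bytes();
    let mut peptides = Vec::new();
    let mut start = 0;

    for i in 0..bytes.len() {
        let cleave = (bytes[i] == b'K' || bytes[i] == b'R')
            && i + 1 < bytes.len()
            && bytes[i + 1] != b'P';
        if cleave {
            let len = i + 1 - start;
            if (min_len..=max_len).contains(&len) {
                peptides.push(&sequence[start..=i]);
            }
            start = i + 1;
        }
    }

    // Don't forget the C-terminal peptide.
    let len = bytes.len() - start;
    if (min_len..=max_len).contains(&len) {
        peptides.push(&sequence[start..]);
    }
    peptides
}

fn main() {
    // Hypothetical example sequence; real input comes from the UniProt parsers.
    let peptides = tryptic_digest("MKWVTFISLLLLFSSAYSRGVFRR", 5, 50);
    println!("{peptides:?}");
}
```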