diff --git a/.gitignore b/.gitignore
index 01de547a..c6161e94 100644
--- a/.gitignore
+++ b/.gitignore
@@ -18,4 +18,3 @@ out
scripts/helper_scripts/parser/output
scripts/helper_scripts/parser/src/META-INF
.idea/
-
diff --git a/scripts/build_binaries.sh b/scripts/build_binaries.sh
new file mode 100755
index 00000000..adfb5c93
--- /dev/null
+++ b/scripts/build_binaries.sh
@@ -0,0 +1,19 @@
+#! /usr/bin/env bash
+
+# All references to an external script should be relative to the location of this script.
+# See: http://mywiki.wooledge.org/BashFAQ/028
+CURRENT_LOCATION="${BASH_SOURCE%/*}"
+
+checkdep() {
+ which $1 > /dev/null 2>&1 || hash $1 > /dev/null 2>&1 || {
+ echo "Unipept database builder requires ${2:-$1} to be installed." >&2
+ exit 1
+ }
+}
+
+checkdep cargo "Rust toolchain"
+
+# Build the Rust binaries and copy them to the helper_scripts folder
+cd "$CURRENT_LOCATION/helper_scripts/unipept-database-rs" || exit 1
+cargo build --release
+find ./target/release -maxdepth 1 -type f -executable -exec cp {} .. \;
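+
+# The release binaries copied above (e.g. xml-parser, write-to-chunk, taxons-lineages,
+# taxons-uniprots-tables, lcas, functional-analysis and taxa-by-chunk; the full list is in
+# helper_scripts/.gitignore) end up next to the legacy helper scripts, where build_database.sh
+# and filter_taxa.sh invoke them.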
diff --git a/scripts/build_database.sh b/scripts/build_database.sh
index a44d13a0..08f40152 100755
--- a/scripts/build_database.sh
+++ b/scripts/build_database.sh
@@ -36,7 +36,7 @@ Required parameters:
- swissprot: https://ftp.expasy.org/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.xml.gz
- trembl: https://ftp.expasy.org/databases/uniprot/current_release/knowledgebase/complete/uniprot_trembl.xml.gz
- * OUTPUT_DIR: Directory in which the tsv.gz-files that are produced by this script will be stored.
+ * OUTPUT_DIR: Directory in which the tsv.lz4 files produced by this script will be stored.
Options:
* -h
@@ -71,14 +71,12 @@ Dependencies:
This script requires some non-standard dependencies to be installed before it can be used. This is a list of these
items (which can normally be installed through your package manager):
- * maven
- * node-js
* curl
* pv
* pigz
- * java
* uuidgen
* parallel
+ * lz4
END
}
@@ -245,12 +243,10 @@ checkDirectoryAndCreate "$4"
### Check that all dependencies required for this script to function are met.
checkdep curl
-checkdep java
-checkdep mvn "Maven"
checkdep uuidgen
checkdep pv
-checkdep node
checkdep pigz
+checkdep lz4
### Default configuration for this script
PEPTIDE_MIN_LENGTH=5 # What is the minimum length (inclusive) for tryptic peptides?"
@@ -258,9 +254,11 @@ PEPTIDE_MAX_LENGTH=50 # What is the maximum length (inclusive) for tryptic pepti
TABDIR="$OUTPUT_DIR" # Where should I store the final TSV files (large, single-write)?
INTDIR="$TEMP_DIR/$UNIPEPT_TEMP_CONSTANT" # Where should I store intermediate TSV files (large, single-write, multiple-read?
KMER_LENGTH=9 # What is the length (k) of the K-mer peptides?
-JAVA_MEM="2g" # How much memory should Java use?
CMD_SORT="sort --buffer-size=$SORT_MEMORY --parallel=4" # Which sort command should I use?
-CMD_GZIP="gzip -" # Which pipe compression command should I use?
+CMD_GZIP="pigz -" # Which pipe compression command should I use for .gz files?
+CMD_ZCAT="pigz -dc" # Which decompression command should I use for .gz files?
+CMD_LZ4="lz4 -c" # Which pipe compression command should I use for .lz4 files?
+CMD_LZ4CAT="lz4 -dc" # Which decompression command should I use for .lz4 files?
ENTREZ_BATCH_SIZE=1000 # Which batch size should I use for communication with Entrez?
TAXON_FALLBACK_URL="https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdmp.zip"
@@ -301,7 +299,22 @@ guz() {
fifo="$(uuidgen)-$(basename "$1")"
mkfifo "$TEMP_DIR/$UNIPEPT_TEMP_CONSTANT/$fifo"
echo "$TEMP_DIR/$UNIPEPT_TEMP_CONSTANT/$fifo"
- { zcat "$1" > "$TEMP_DIR/$UNIPEPT_TEMP_CONSTANT/$fifo" && rm "$TEMP_DIR/$UNIPEPT_TEMP_CONSTANT/$fifo" || kill "$self"; } > /dev/null &
+ { $CMD_ZCAT "$1" > "$TEMP_DIR/$UNIPEPT_TEMP_CONSTANT/$fifo" && rm "$TEMP_DIR/$UNIPEPT_TEMP_CONSTANT/$fifo" || kill "$self"; } > /dev/null &
+}
+
+lz() {
+ fifo="$(uuidgen)-$(basename "$1")"
+ mkfifo "$TEMP_DIR/$UNIPEPT_TEMP_CONSTANT/$fifo"
+ echo "$TEMP_DIR/$UNIPEPT_TEMP_CONSTANT/$fifo"
+ mkdir -p "$(dirname "$1")"
+ { $CMD_LZ4 - < "$TEMP_DIR/$UNIPEPT_TEMP_CONSTANT/$fifo" > "$1" && rm "$TEMP_DIR/$UNIPEPT_TEMP_CONSTANT/$fifo" || kill "$self"; } > /dev/null &
+}
+
+luz() {
+ fifo="$(uuidgen)-$(basename "$1")"
+ mkfifo "$TEMP_DIR/$UNIPEPT_TEMP_CONSTANT/$fifo"
+ echo "$TEMP_DIR/$UNIPEPT_TEMP_CONSTANT/$fifo"
+ { $CMD_LZ4CAT "$1" > "$TEMP_DIR/$UNIPEPT_TEMP_CONSTANT/$fifo" && rm "$TEMP_DIR/$UNIPEPT_TEMP_CONSTANT/$fifo" || kill "$self"; } > /dev/null &
}
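+
+# Illustrative use of the fifo helpers above (an explanatory sketch, not original code):
+#   some-tool --output "$(lz "$OUTPUT_DIR/table.tsv.lz4")"
+# hands the tool a plain named pipe to write to, while the background job started by lz
+# compresses whatever arrives on that pipe into the .lz4 file; luz (and guz for .gz files)
+# works the other way around and yields a pipe that streams the decompressed contents.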
have() {
@@ -350,10 +363,10 @@ create_taxon_tables() {
-e 's/parvorder/no rank/' "$TEMP_DIR/$UNIPEPT_TEMP_CONSTANT/nodes.dmp"
mkdir -p "$OUTPUT_DIR"
- java -Xms"$JAVA_MEM" -Xmx"$JAVA_MEM" -jar "$CURRENT_LOCATION/helper_scripts/NamesNodes2TaxonsLineages.jar" \
+ $CURRENT_LOCATION/helper_scripts/taxons-lineages \
--names "$TEMP_DIR/$UNIPEPT_TEMP_CONSTANT/names.dmp" --nodes "$TEMP_DIR/$UNIPEPT_TEMP_CONSTANT/nodes.dmp" \
- --taxons "$(gz "$OUTPUT_DIR/taxons.tsv.gz")" \
- --lineages "$(gz "$OUTPUT_DIR/lineages.tsv.gz")"
+ --taxons "$(lz "$OUTPUT_DIR/taxons.tsv.lz4")" \
+ --lineages "$(lz "$OUTPUT_DIR/lineages.tsv.lz4")"
rm "$TEMP_DIR/$UNIPEPT_TEMP_CONSTANT/names.dmp" "$TEMP_DIR/$UNIPEPT_TEMP_CONSTANT/nodes.dmp"
log "Finished creating the taxon tables."
@@ -368,7 +381,8 @@ download_and_convert_all_sources() {
DB_TYPES_ARRAY=($DB_TYPES)
DB_SOURCES_ARRAY=($DB_SOURCES)
- IFS="$OLDIFS"
+ # Set IFS to newline so that newline-separated lists such as $CHUNKS split on line
+ # boundaries only, even when file or folder names contain spaces
+ IFS=$'\n'
while [[ "$IDX" -ne "${#DB_TYPES_ARRAY}" ]] && [[ -n $(echo "${DB_TYPES_ARRAY[$IDX]}" | sed "s/\s//g") ]]
do
@@ -396,7 +410,7 @@ download_and_convert_all_sources() {
reportProgress -1 "Downloading database index for $DB_TYPE." 3
- curl --continue-at - --create-dirs "$DB_SOURCE" --silent | zcat | java -jar "$CURRENT_LOCATION/helper_scripts/XmlToTabConverter.jar" 5 50 "$DB_TYPE" "$VERBOSE" | node "$CURRENT_LOCATION/helper_scripts/WriteToChunk.js" "$DB_INDEX_OUTPUT" "$VERBOSE"
+ curl --continue-at - --create-dirs "$DB_SOURCE" --silent | pv -i 5 -n -s "$SIZE" 2> >(reportProgress - "Downloading database index for $DB_TYPE." 3 >&2) | $CMD_ZCAT | $CURRENT_LOCATION/helper_scripts/xml-parser -t "$DB_TYPE" | $CURRENT_LOCATION/helper_scripts/write-to-chunk --output-dir "$DB_INDEX_OUTPUT"
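+ # Pipeline sketch: curl streams the gzipped XML, pv reports download progress, $CMD_ZCAT
+ # decompresses it, xml-parser converts the UniProt XML into tab-separated records, and
+ # write-to-chunk spreads those records over the per-taxon-range chunk files.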
# Now, compress the different chunks
CHUNKS=$(find "$DB_INDEX_OUTPUT" -name "*.chunk")
@@ -407,7 +421,7 @@ download_and_convert_all_sources() {
for CHUNK in $CHUNKS
do
echo "Compressing $CHUNK_IDX of $TOTAL_CHUNKS for $DB_TYPE"
- pv -i 5 -n "$CHUNK" 2> >(reportProgress - "Processing chunk $CHUNK_IDX of $TOTAL_CHUNKS for $DB_TYPE index." 4 >&2) | pigz > "$CHUNK.gz"
+ pv -i 5 -n "$CHUNK" 2> >(reportProgress - "Processing chunk $CHUNK_IDX of $TOTAL_CHUNKS for $DB_TYPE index." 4 >&2) | lz4 -c > "$CHUNK.lz4"
# Remove the chunk that was just compressed
rm "$CHUNK"
CHUNK_IDX=$((CHUNK_IDX + 1))
@@ -440,7 +454,7 @@ download_and_convert_all_sources() {
SIZE="$(curl -I "$DB_SOURCE" -s | grep -i content-length | tr -cd '[0-9]')"
- curl --continue-at - --create-dirs "$DB_SOURCE" --silent | pv -i 5 -n -s "$SIZE" 2> >(reportProgress - "Downloading database index for $DB_TYPE." 3 >&2) | zcat | java -jar "$CURRENT_LOCATION/helper_scripts/XmlToTabConverter.jar" 5 50 "$DB_TYPE" "$VERBOSE" | node "$CURRENT_LOCATION/helper_scripts/WriteToChunk.js" "$DB_INDEX_OUTPUT" "$VERBOSE"
+ curl --continue-at - --create-dirs "$DB_SOURCE" --silent | pv -i 5 -n -s "$SIZE" 2> >(reportProgress - "Downloading database index for $DB_TYPE." 3 >&2) | $CMD_ZCAT | $CURRENT_LOCATION/helper_scripts/xml-parser -t "$DB_TYPE" | $CURRENT_LOCATION/helper_scripts/write-to-chunk --output-dir "$DB_INDEX_OUTPUT"
# Now, compress the different chunks
CHUNKS=$(find "$DB_INDEX_OUTPUT" -name "*.chunk")
@@ -451,7 +465,7 @@ download_and_convert_all_sources() {
for CHUNK in $CHUNKS
do
echo "Compressing $CHUNK_IDX of $TOTAL_CHUNKS for $DB_TYPE"
- pv -i 5 -n "$CHUNK" 2> >(reportProgress - "Processing chunk $CHUNK_IDX of $TOTAL_CHUNKS for $DB_TYPE index." 4 >&2) | pigz > "$CHUNK.gz"
+ pv -i 5 -n "$CHUNK" 2> >(reportProgress - "Processing chunk $CHUNK_IDX of $TOTAL_CHUNKS for $DB_TYPE index." 4 >&2) | lz4 -c > "$CHUNK.lz4"
# Remove the chunk that was just compressed
rm "$CHUNK"
CHUNK_IDX=$((CHUNK_IDX + 1))
@@ -465,6 +479,8 @@ download_and_convert_all_sources() {
IDX=$((IDX + 1))
done
+
+ IFS="$OLDIFS"
}
filter_sources_by_taxa() {
@@ -491,176 +507,142 @@ filter_sources_by_taxa() {
mkdir -p "$TEMP_DIR/$UNIPEPT_TEMP_CONSTANT/filter"
- $CURRENT_LOCATION/helper_scripts/filter_taxa.sh "$TAXA" "$DB_INDEX_OUTPUT" "$TEMP_DIR/$UNIPEPT_TEMP_CONSTANT/filter" "$OUTPUT_DIR/lineages.tsv.gz"
+ $CURRENT_LOCATION/helper_scripts/filter_taxa.sh "$TAXA" "$DB_INDEX_OUTPUT" "$TEMP_DIR/$UNIPEPT_TEMP_CONSTANT/filter" "$OUTPUT_DIR/lineages.tsv.lz4"
IDX=$((IDX + 1))
done
}
create_most_tables() {
- have "$OUTPUT_DIR/taxons.tsv.gz" || return
+ have "$OUTPUT_DIR/taxons.tsv.lz4" || return
log "Started calculation of most tables."
reportProgress "-1" "Started building main database tables." 5
mkdir -p "$OUTPUT_DIR" "$INTDIR"
- if [ $VERBOSE = "true" ]
- then
- $VERBOSE_FLAG="--verbose"
- fi
-
- cat - | java -Xms"$JAVA_MEM" -Xmx"$JAVA_MEM" -jar "$CURRENT_LOCATION/helper_scripts/TaxonsUniprots2Tables.jar" \
+ cat - | $CURRENT_LOCATION/helper_scripts/taxons-uniprots-tables \
--peptide-min "$PEPTIDE_MIN_LENGTH" \
--peptide-max "$PEPTIDE_MAX_LENGTH" \
- --taxons "$(guz "$OUTPUT_DIR/taxons.tsv.gz")" \
- --peptides "$(gz "$INTDIR/peptides.tsv.gz")" \
- --uniprot-entries "$(gz "$OUTPUT_DIR/uniprot_entries.tsv.gz")" \
- --ec "$(gz "$OUTPUT_DIR/ec_cross_references.tsv.gz")" \
- --go "$(gz "$OUTPUT_DIR/go_cross_references.tsv.gz")" \
- --interpro "$(gz "$OUTPUT_DIR/interpro_cross_references.tsv.gz")" \
- $VERBOSE_FLAG
-
- log "Finished calculation of most tables with status $?"
+ --taxons "$(luz "$OUTPUT_DIR/taxons.tsv.lz4")" \
+ --peptides "$(lz "$INTDIR/peptides-out.tsv.lz4")" \
+ --uniprot-entries "$(lz "$OUTPUT_DIR/uniprot_entries.tsv.lz4")" \
+ --ec "$(lz "$OUTPUT_DIR/ec_cross_references.tsv.lz4")" \
+ --go "$(lz "$OUTPUT_DIR/go_cross_references.tsv.lz4")" \
+ --interpro "$(lz "$OUTPUT_DIR/interpro_cross_references.tsv.lz4")"
+
+ log "Started sorting peptides table"
+
+ $CMD_LZ4CAT $INTDIR/peptides-out.tsv.lz4 \
+ | LC_ALL=C $CMD_SORT -k2 \
+ | $CMD_LZ4 > $INTDIR/peptides-equalized.tsv.lz4
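+ # Sorting on column 2 (the I/L-equalized sequence) means the first join in substitute_aas
+ # further down can consume peptides-equalized.tsv.lz4 directly, without re-sorting it.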
+
+ rm $INTDIR/peptides-out.tsv.lz4
+ log "Finished calculation of most tables with status $?"
}
create_tables_and_filter() {
filter_sources_by_taxa | create_most_tables
}
-join_equalized_pepts_and_entries() {
- echo "Test if files for joining peptides are available."
- have "$INTDIR/peptides.tsv.gz" "$OUTPUT_DIR/uniprot_entries.tsv.gz" || return
- log "Started the joining of equalized peptides and uniprot entries."
- mkfifo "peptides_eq" "entries_eq"
- zcat "$INTDIR/peptides.tsv.gz" | gawk '{ printf("%012d\t%s\n", $4, $2) }' > "peptides_eq" &
- zcat "$OUTPUT_DIR/uniprot_entries.tsv.gz" | gawk '{ printf("%012d\t%s\n", $1, $4) }' > "entries_eq" &
- join -t ' ' -o '1.2,2.2' -j 1 "peptides_eq" "entries_eq" \
- | LC_ALL=C $CMD_SORT -k1 \
- | $CMD_GZIP - > "$INTDIR/aa_sequence_taxon_equalized.tsv.gz"
- rm "peptides_eq" "entries_eq"
- log "Finished the joining of equalized peptides and uniprot entries with status $?."
-}
+number_sequences() {
+ have "$INTDIR/peptides-equalized.tsv.lz4" || return
+ log "Started the numbering of sequences."
-join_original_pepts_and_entries() {
- have "$INTDIR/peptides.tsv.gz" "$OUTPUT_DIR/uniprot_entries.tsv.gz" || return
- log "Started the joining of original peptides and uniprot entries."
- mkfifo "peptides_orig" "entries_orig"
- zcat "$INTDIR/peptides.tsv.gz" | gawk '{ printf("%012d\t%s\n", $4, $3) }' > "peptides_orig" &
- zcat "$OUTPUT_DIR/uniprot_entries.tsv.gz" | gawk '{ printf("%012d\t%s\n", $1, $4) }' > "entries_orig" &
- join -t ' ' -o '1.2,2.2' -j 1 "peptides_orig" "entries_orig" \
- | LC_ALL=C $CMD_SORT -k1 \
- | $CMD_GZIP - > "$INTDIR/aa_sequence_taxon_original.tsv.gz"
- rm "peptides_orig" "entries_orig"
- log "Finished the joining of original peptides and uniprot entries with status $?."
-}
+ mkfifo "p_eq"
+ mkfifo "p_or"
+ $CMD_LZ4CAT $INTDIR/peptides-equalized.tsv.lz4 | cut -f 3 | LC_ALL=C sort -u > "p_or" &
+ $CMD_LZ4CAT $INTDIR/peptides-equalized.tsv.lz4 | cut -f 2 | uniq > "p_eq" &
+
+ LC_ALL=C sort -u -m "p_or" "p_eq" | cat -n \
+ | sed 's/^ *//' | $CMD_LZ4 - > "$INTDIR/sequences.tsv.lz4"
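+ # "sort -u -m" merges the two already-ordered streams (p_eq keeps the order of the earlier
+ # sort on column 2, p_or is sorted explicitly) and drops duplicates, so "cat -n" assigns
+ # every distinct sequence a unique numeric ID.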
+
+ rm "p_eq" "p_or"
-number_sequences() {
- have "$INTDIR/aa_sequence_taxon_equalized.tsv.gz" "$INTDIR/aa_sequence_taxon_original.tsv.gz" || return
- log "Started the numbering of sequences."
- mkfifo "equalized" "original"
- zcat "$INTDIR/aa_sequence_taxon_equalized.tsv.gz" | cut -f1 | uniq > "equalized" &
- zcat "$INTDIR/aa_sequence_taxon_original.tsv.gz" | cut -f1 | uniq > "original" &
- LC_ALL=C $CMD_SORT -m "equalized" "original" | uniq | cat -n \
- | sed 's/^ *//' | $CMD_GZIP - > "$INTDIR/sequences.tsv.gz"
- rm "equalized" "original"
log "Finished the numbering of sequences with status $?."
}
+substitute_aas() {
+ have "$INTDIR/peptides-equalized.tsv.lz4" "$INTDIR/sequences.tsv.lz4"
+
+ log "Started the substitution of equalized AA's by ID's for the peptides."
+ $CMD_LZ4CAT $INTDIR/peptides-equalized.tsv.lz4 \
+ | join -t ' ' -o '1.1,2.1,1.3,1.4,1.5,1.6' -1 2 -2 2 - "$(luz "$INTDIR/sequences.tsv.lz4")" \
+ | $CMD_LZ4 - > "$INTDIR/peptides_by_equalized.tsv.lz4"
+
+ rm "$INTDIR/peptides-equalized.tsv.lz4"
+ log "Finished the substitution of equalized AA's by ID's for the peptides with status $?."
+
+ log "Started the substitution of original AA's by ID's for the peptides."
+ $CMD_LZ4CAT "$INTDIR/peptides_by_equalized.tsv.lz4" \
+ | LC_ALL=C $CMD_SORT -k 3b,3 \
+ | join -t ' ' -o '1.1,1.2,2.1,1.4,1.5,1.6' -1 3 -2 2 - "$(luz "$INTDIR/sequences.tsv.lz4")" \
+ | $CMD_LZ4 - > "$INTDIR/peptides_by_original.tsv.lz4"
+
+ log "Finished the substitution of original AA's by ID's for the peptides with status $?."
+}
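+
+# What substitute_aas does, roughly (an explanatory note, not original code): each peptide row
+# initially carries the equalized and original sequences as text; the two joins above replace
+# those text columns (fields 2 and 3) with the numeric IDs assigned in sequences.tsv.lz4, so the
+# LCA and FA steps below operate on compact sequence IDs instead of raw amino-acid strings.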
calculate_equalized_lcas() {
- have "$INTDIR/sequences.tsv.gz" "$INTDIR/aa_sequence_taxon_equalized.tsv.gz" "$OUTPUT_DIR/lineages.tsv.gz" || return
- log "Started the calculation of equalized LCA's (after substituting AA's by ID's)."
- join -t ' ' -o '1.1,2.2' -1 2 -2 1 \
- "$(guz "$INTDIR/sequences.tsv.gz")" \
- "$(guz "$INTDIR/aa_sequence_taxon_equalized.tsv.gz")" \
- | java -Xms"$JAVA_MEM" -Xmx"$JAVA_MEM" -jar "$CURRENT_LOCATION/helper_scripts/LineagesSequencesTaxons2LCAs.jar" "$(guz "$OUTPUT_DIR/lineages.tsv.gz")" \
- | $CMD_GZIP - > "$INTDIR/LCAs_equalized.tsv.gz"
+ have "$INTDIR/peptides_by_equalized.tsv.lz4" || return
+ log "Started the calculation of equalized LCA's."
+ $CMD_LZ4CAT $INTDIR/peptides_by_equalized.tsv.lz4 | cut -f 2,6 \
+ | $CURRENT_LOCATION/helper_scripts/lcas --infile "$(luz "$OUTPUT_DIR/lineages.tsv.lz4")" \
+ | $CMD_LZ4 - > "$INTDIR/LCAs_equalized.tsv.lz4"
log "Finished the calculation of equalized LCA's (after substituting AA's by ID's) with status $?."
}
calculate_original_lcas() {
- have "$INTDIR/sequences.tsv.gz" "$INTDIR/aa_sequence_taxon_original.tsv.gz" "$OUTPUT_DIR/lineages.tsv.gz" || return
- log "Started the calculation of original LCA's (after substituting AA's by ID's)."
- join -t ' ' -o '1.1,2.2' -1 2 -2 1 \
- "$(guz "$INTDIR/sequences.tsv.gz")" \
- "$(guz "$INTDIR/aa_sequence_taxon_original.tsv.gz")" \
- | java -Xms"$JAVA_MEM" -Xmx"$JAVA_MEM" -jar "$CURRENT_LOCATION/helper_scripts/LineagesSequencesTaxons2LCAs.jar" "$(guz "$OUTPUT_DIR/lineages.tsv.gz")" \
- | $CMD_GZIP - > "$INTDIR/LCAs_original.tsv.gz"
+ have "$INTDIR/peptides_by_original.tsv.lz4" || return
+ log "Started the calculation of original LCA's"
+ $CMD_LZ4CAT $INTDIR/peptides_by_original.tsv.lz4 | cut -f 3,6 \
+ | $CURRENT_LOCATION/helper_scripts/lcas --infile "$(luz "$OUTPUT_DIR/lineages.tsv.lz4")" \
+ | $CMD_LZ4 - > "$INTDIR/LCAs_original.tsv.lz4"
log "Finished the calculation of original LCA's (after substituting AA's by ID's) with status $?."
}
-substitute_equalized_aas() {
- have "$INTDIR/peptides.tsv.gz" "$INTDIR/sequences.tsv.gz" || return
- log "Started the substitution of equalized AA's by ID's for the peptides."
- zcat "$INTDIR/peptides.tsv.gz" \
- | LC_ALL=C $CMD_SORT -k 2b,2 \
- | join -t ' ' -o '1.1,2.1,1.3,1.4,1.5' -1 2 -2 2 - "$(guz "$INTDIR/sequences.tsv.gz")" \
- | $CMD_GZIP - > "$INTDIR/peptides_by_equalized.tsv.gz"
- log "Finished the substitution of equalized AA's by ID's for the peptides with status $?."
-}
-
-
calculate_equalized_fas() {
- have "$INTDIR/peptides_by_equalized.tsv.gz" || return
+ have "$INTDIR/peptides_by_equalized.tsv.lz4" || return
log "Started the calculation of equalized FA's."
mkfifo "peptides_eq"
- zcat "$INTDIR/peptides_by_equalized.tsv.gz" | cut -f2,5 > "peptides_eq" &
- node "$CURRENT_LOCATION/helper_scripts/FunctionalAnalysisPeptides.js" "peptides_eq" "$(gz "$INTDIR/FAs_equalized.tsv.gz")"
+ $CMD_LZ4CAT "$INTDIR/peptides_by_equalized.tsv.lz4" | cut -f2,5 > "peptides_eq" &
+ $CURRENT_LOCATION/helper_scripts/functional-analysis -i "peptides_eq" -o "$(lz "$INTDIR/FAs_equalized.tsv.lz4")"
rm "peptides_eq"
log "Finished the calculation of equalized FA's with status $?."
}
-substitute_original_aas() {
- have "$INTDIR/peptides_by_equalized.tsv.gz" "$INTDIR/sequences.tsv.gz" || return
- log "Started the substitution of original AA's by ID's for the peptides."
- zcat "$INTDIR/peptides_by_equalized.tsv.gz" \
- | LC_ALL=C $CMD_SORT -k 3b,3 \
- | join -t ' ' -o '1.1,1.2,2.1,1.4,1.5' -1 3 -2 2 - "$(guz "$INTDIR/sequences.tsv.gz")" \
- | $CMD_GZIP - > "$INTDIR/peptides_by_original.tsv.gz"
- log "Finished the substitution of equalized AA's by ID's for the peptides with status $?."
-}
-
calculate_original_fas() {
- have "$INTDIR/peptides_by_original.tsv.gz" || return
+ have "$INTDIR/peptides_by_original.tsv.lz4" || return
log "Started the calculation of original FA's."
mkfifo "peptides_orig"
- zcat "$INTDIR/peptides_by_original.tsv.gz" | cut -f3,5 > "peptides_orig" &
- node "$CURRENT_LOCATION/helper_scripts/FunctionalAnalysisPeptides.js" "peptides_orig" "$(gz "$INTDIR/FAs_original.tsv.gz")"
+ $CMD_LZ4CAT "$INTDIR/peptides_by_original.tsv.lz4" | cut -f3,5 > "peptides_orig" &
+ $CURRENT_LOCATION/helper_scripts/functional-analysis -i "peptides_orig" -o "$(lz "$INTDIR/FAs_original.tsv.lz4")"
rm "peptides_orig"
log "Finished the calculation of original FA's."
}
-sort_peptides() {
- have "$INTDIR/peptides_by_original.tsv.gz" || return
- log "Started sorting the peptides table."
- mkdir -p "$OUTPUT_DIR"
- zcat "$INTDIR/peptides_by_original.tsv.gz" \
- | LC_ALL=C $CMD_SORT -n \
- | $CMD_GZIP - > "$OUTPUT_DIR/peptides.tsv.gz"
- log "Finished sorting the peptides table."
-}
create_sequence_table() {
- have "$INTDIR/LCAs_original.tsv.gz" "$INTDIR/LCAs_equalized.tsv.gz" "$INTDIR/FAs_original.tsv.gz" "$INTDIR/FAs_equalized.tsv.gz" "$INTDIR/sequences.tsv.gz" || return
+ have "$INTDIR/LCAs_original.tsv.lz4" "$INTDIR/LCAs_equalized.tsv.lz4" "$INTDIR/FAs_original.tsv.lz4" "$INTDIR/FAs_equalized.tsv.lz4" "$INTDIR/sequences.tsv.lz4" || return
log "Started the creation of the sequences table."
mkdir -p "$OUTPUT_DIR"
mkfifo "olcas" "elcas" "ofas" "efas"
- zcat "$INTDIR/LCAs_original.tsv.gz" | gawk '{ printf("%012d\t%s\n", $1, $2) }' > "olcas" &
- zcat "$INTDIR/LCAs_equalized.tsv.gz" | gawk '{ printf("%012d\t%s\n", $1, $2) }' > "elcas" &
- zcat "$INTDIR/FAs_original.tsv.gz" | gawk '{ printf("%012d\t%s\n", $1, $2) }' > "ofas" &
- zcat "$INTDIR/FAs_equalized.tsv.gz" | gawk '{ printf("%012d\t%s\n", $1, $2) }' > "efas" &
- zcat "$INTDIR/sequences.tsv.gz" | gawk '{ printf("%012d\t%s\n", $1, $2) }' \
+ $CMD_LZ4CAT "$INTDIR/LCAs_original.tsv.lz4" | gawk '{ printf("%012d\t%s\n", $1, $2) }' > "olcas" &
+ $CMD_LZ4CAT "$INTDIR/LCAs_equalized.tsv.lz4" | gawk '{ printf("%012d\t%s\n", $1, $2) }' > "elcas" &
+ $CMD_LZ4CAT "$INTDIR/FAs_original.tsv.lz4" | gawk '{ printf("%012d\t%s\n", $1, $2) }' > "ofas" &
+ $CMD_LZ4CAT "$INTDIR/FAs_equalized.tsv.lz4" | gawk '{ printf("%012d\t%s\n", $1, $2) }' > "efas" &
+ $CMD_LZ4CAT "$INTDIR/sequences.tsv.lz4" | gawk '{ printf("%012d\t%s\n", $1, $2) }' \
| join --nocheck-order -a1 -e '\N' -t ' ' -o "1.1 1.2 2.2" - "olcas" \
| join --nocheck-order -a1 -e '\N' -t ' ' -o "1.1 1.2 1.3 2.2" - "elcas" \
| join --nocheck-order -a1 -e '\N' -t ' ' -o '1.1 1.2 1.3 1.4 2.2' - "ofas" \
| join --nocheck-order -a1 -e '\N' -t ' ' -o '1.1 1.2 1.3 1.4 1.5 2.2' - "efas" \
- | sed 's/^0*//' | $CMD_GZIP - > "$OUTPUT_DIR/sequences.tsv.gz"
+ | sed 's/^0*//' \
+ | awk -F'\t' 'BEGIN {OFS="\t"} {gsub(/Z/, "K", $2); print}' \
+ | $CMD_LZ4 - > "$OUTPUT_DIR/sequences.tsv.lz4"
rm "olcas" "elcas" "ofas" "efas"
log "Finished the creation of the sequences table."
}
@@ -679,7 +661,7 @@ fetch_ec_numbers() {
/^DE/ { gsub(/.$/, "", $2)
name = name $2 }
END { print id, name }'
- } | cat -n | sed 's/^ *//' | $CMD_GZIP - > "$OUTPUT_DIR/ec_numbers.tsv.gz"
+ } | cat -n | sed 's/^ *//' | $CMD_LZ4 - > "$OUTPUT_DIR/ec_numbers.tsv.lz4"
log "Finished creating EC numbers."
}
@@ -711,14 +693,14 @@ fetch_go_terms() {
id++
}
}
- type = "" }' | $CMD_GZIP - > "$OUTPUT_DIR/go_terms.tsv.gz"
+ type = "" }' | $CMD_LZ4 - > "$OUTPUT_DIR/go_terms.tsv.lz4"
log "Finished creating GO terms."
}
fetch_interpro_entries() {
log "Started creating InterPro Entries."
mkdir -p "$OUTPUT_DIR"
- curl -s "$INTERPRO_URL" | grep '^IPR' | cat -n | sed 's/^ *//' | $CMD_GZIP - > "$OUTPUT_DIR/interpro_entries.tsv.gz"
+ curl -s "$INTERPRO_URL" | grep '^IPR' | cat -n | sed 's/^ *//' | $CMD_LZ4 - > "$OUTPUT_DIR/interpro_entries.tsv.lz4"
log "Finished creating InterPro Entries."
}
@@ -728,10 +710,10 @@ fetch_interpro_entries() {
#dot: create_kmer_index -> kmer_index
#dot: kmer_index [color="#f28e2b"]
create_kmer_index() {
- have "$OUTPUT_DIR/uniprot_entries.tsv.gz" "$OUTPUT_DIR/taxons.tsv.gz" || return
+ have "$OUTPUT_DIR/uniprot_entries.tsv.lz4" "$OUTPUT_DIR/taxons.tsv.lz4" || return
log "Started the construction of the $KMER_LENGTH-mer index."
for PREFIX in A C D E F G H I K L M N P Q R S T V W Y; do
- pv -N $PREFIX "$OUTPUT_DIR/uniprot_entries.tsv.gz" \
+ pv -N $PREFIX "$OUTPUT_DIR/uniprot_entries.tsv.lz4" \
- | gunzip \
+ | lz4 -dc \
| cut -f4,7 \
| grep "^[0-9]* [ACDEFGHIKLMNPQRSTVWY]*$" \
@@ -740,7 +722,7 @@ create_kmer_index() {
| LC_ALL=C $CMD_SORT \
| sed "s/^/$PREFIX/"
done \
- | umgap joinkmers "$(guz "$OUTPUT_DIR/taxons.tsv.gz")" \
+ | umgap joinkmers "$(luz "$OUTPUT_DIR/taxons.tsv.lz4")" \
| cut -d' ' -f1,2 \
| umgap buildindex \
> "$OUTPUT_DIR/$KMER_LENGTH-mer.index"
@@ -752,9 +734,9 @@ create_kmer_index() {
#dot: create_tryptic_index -> tryptic_index
#dot: tryptic_index [color="#f28e2b"]
create_tryptic_index() {
- have "$TABDIR/sequences.tsv.gz" || return
+ have "$TABDIR/sequences.tsv.lz4" || return
log "Started the construction of the tryptic index."
- pv "$TABDIR/sequences.tsv.gz" \
+ pv "$TABDIR/sequences.tsv.lz4" \
- | gunzip \
+ | lz4 -dc \
| cut -f2,3 \
| grep -v "\\N" \
@@ -771,43 +753,31 @@ database)
download_and_convert_all_sources
create_tables_and_filter
echo "Created tables!"
- join_equalized_pepts_and_entries &
- pid1=$!
- join_original_pepts_and_entries &
- pid2=$!
- wait $pid1
- wait $pid2
number_sequences
- reportProgress "-1" "Calculating lowest common ancestors." 6
+ substitute_aas
+ reportProgress "-1" "Calculating lowest common ancestors and functional annotations." 6
calculate_equalized_lcas &
pid1=$!
calculate_original_lcas &
pid2=$!
- wait $pid1
- wait $pid2
- rm "$INTDIR/aa_sequence_taxon_equalized.tsv.gz"
- rm "$INTDIR/aa_sequence_taxon_original.tsv.gz"
- substitute_equalized_aas
- rm "$INTDIR/peptides.tsv.gz"
- substitute_original_aas
- reportProgress "-1" "Calculating functional annotations." 7
calculate_equalized_fas &
- pid1=$!
+ pid3=$!
calculate_original_fas &
- pid2=$!
+ pid4=$!
wait $pid1
wait $pid2
- rm "$INTDIR/peptides_by_equalized.tsv.gz"
- reportProgress "-1" "Sorting peptides." 8
- sort_peptides
- rm "$INTDIR/peptides_by_original.tsv.gz"
+ wait $pid3
+ wait $pid4
reportProgress "-1" "Creating sequence table." 9
create_sequence_table
- rm "$INTDIR/LCAs_original.tsv.gz"
- rm "$INTDIR/LCAs_equalized.tsv.gz"
- rm "$INTDIR/FAs_original.tsv.gz"
- rm "$INTDIR/FAs_equalized.tsv.gz"
- rm "$INTDIR/sequences.tsv.gz"
+ rm "$INTDIR/LCAs_original.tsv.lz4"
+ rm "$INTDIR/LCAs_equalized.tsv.lz4"
+ rm "$INTDIR/FAs_original.tsv.lz4"
+ rm "$INTDIR/FAs_equalized.tsv.lz4"
+ rm "$INTDIR/sequences.tsv.lz4"
+ rm "$INTDIR/peptides_by_equalized.tsv.lz4"
+ # Use the table joined on the original (non-equalized) sequences as the final peptides table
+ mv "$INTDIR/peptides_by_original.tsv.lz4" "$OUTPUT_DIR/peptides.tsv.lz4"
reportProgress "-1" "Fetching EC numbers." 10
fetch_ec_numbers
reportProgress "-1" "Fetching GO terms." 11
@@ -815,11 +785,11 @@ database)
reportProgress "-1" "Fetching InterPro entries." 12
fetch_interpro_entries
reportProgress "-1" "Computing database indices" 13
- ENTRIES=$(zcat "$OUTPUT_DIR/uniprot_entries.tsv.gz" | wc -l)
+ ENTRIES=$($CMD_LZ4CAT "$OUTPUT_DIR/uniprot_entries.tsv.lz4" | wc -l)
echo "Database contains: ##$ENTRIES##"
;;
static-database)
- if ! have "$TABDIR/taxons.tsv.gz"; then
+ if ! have "$TABDIR/taxons.tsv.lz4"; then
create_taxon_tables
fi
fetch_ec_numbers
@@ -830,10 +800,10 @@ kmer-index)
checkdep pv
checkdep umgap "umgap crate (for umgap buildindex)"
- if ! have "$OUTPUT_DIR/taxons.tsv.gz"; then
+ if ! have "$OUTPUT_DIR/taxons.tsv.lz4"; then
create_taxon_tables
fi
- if ! have "$OUTPUT_DIR/uniprot_entries.tsv.gz"; then
+ if ! have "$OUTPUT_DIR/uniprot_entries.tsv.lz4"; then
download_and_convert_all_sources
create_tables_and_filter
fi
@@ -843,22 +813,20 @@ tryptic-index)
checkdep pv
checkdep umgap "umgap crate (for umgap buildindex)"
- if ! have "$TABDIR/taxons.tsv.gz"; then
+ if ! have "$TABDIR/taxons.tsv.lz4"; then
create_taxon_tables
fi
- if ! have "$TABDIR/sequences.tsv.gz"; then
+ if ! have "$TABDIR/sequences.tsv.lz4"; then
download_and_convert_all_sources
create_tables_and_filter
- join_equalized_pepts_and_entries
- join_original_pepts_and_entries
number_sequences
+ substitute_aas
calculate_equalized_lcas
calculate_original_lcas
- substitute_equalized_aas
calculate_equalized_fas
- substitute_original_aas
calculate_original_fas
create_sequence_table
+ # TODO remove temp files
fi
create_tryptic_index
;;
diff --git a/scripts/helper_scripts/.gitignore b/scripts/helper_scripts/.gitignore
new file mode 100644
index 00000000..38df1f8a
--- /dev/null
+++ b/scripts/helper_scripts/.gitignore
@@ -0,0 +1,9 @@
+# Ignore the compiled binaries that get moved here
+dat-parser
+functional-analysis
+lcas
+taxa-by-chunk
+taxons-lineages
+taxons-uniprots-tables
+write-to-chunk
+xml-parser
diff --git a/scripts/helper_scripts/FunctionalAnalysisPeptides.js b/scripts/helper_scripts/FunctionalAnalysisPeptides.js
deleted file mode 100755
index 2e635468..00000000
--- a/scripts/helper_scripts/FunctionalAnalysisPeptides.js
+++ /dev/null
@@ -1,73 +0,0 @@
-const readline = require('readline');
-const fs = require('fs');
-const start = new Date().getTime();
-const args = process.argv;
-if (args.length !== 4) {
- console.log("Please provide 2 parameters: input and output.");
- process.exit(1);
-}
-const inputFile = args[2];
-const outputFile = args[3];
-const readInterface = readline.createInterface({
- input: fs.createReadStream(inputFile)
-});
-const writer = fs.createWriteStream(outputFile);
-let row = null;
-let curPept = null;
-let numProt = 0;
-let numAnnotatedGO = 0;
-let numAnnotatedEC = 0;
-let numAnnotatedInterPro = 0;
-let done = 0;
-let m = new Map();
-readInterface.on('line', function (line) {
- row = line.split("\t");
- if (row[0] !== curPept) {
- if (curPept !== null) {
- if (m.size !== 0) {
- writer.write(`${curPept}\t{"num":{"all":${numProt},"EC":${numAnnotatedEC},"GO":${numAnnotatedGO},"IPR":${numAnnotatedInterPro}},"data":{${Array.from(m.entries(), ([k, v]) => `"${k}":${v}`).join(",")}}}\n`);
- }
- }
- m.clear();
- numProt = 0;
- numAnnotatedGO = 0;
- numAnnotatedEC = 0;
- numAnnotatedInterPro = 0;
- curPept = row[0];
- }
- numProt++;
- if (row.length > 1) {
- const terms = row[1].split(";");
- let hasEC = false;
- let hasGO = false;
- let hasInterPro = false;
- for (const term of terms) {
- if (!term) {
- continue;
- }
- if (term.startsWith("G")) {
- hasGO = true;
- } else if (term.startsWith("E")) {
- hasEC = true;
- } else {
- hasInterPro = true;
- }
- m.set(term, (m.get(term) || 0) + 1);
- }
- numAnnotatedGO += hasGO ? 1 : 0;
- numAnnotatedEC += hasEC ? 1 : 0;
- numAnnotatedInterPro += hasInterPro ? 1 : 0;
- }
- done++;
- if (done % 1000000 === 0) {
- console.log("FA " + done + " rows");
- }
-});
-readInterface.on('close', function () {
- if (m.size !== 0) {
- writer.write(`${curPept}\t{"num":{"all":${numProt},"EC":${numAnnotatedEC},"GO":${numAnnotatedGO},"IPR":${numAnnotatedInterPro}},"data":{${Array.from(m.entries(), ([k, v]) => `"${k}":${v}`).join(",")}}}\n`);
- }
- writer.end();
- const end = new Date().getTime();
- console.log("Took " + (end - start) / 1000 + "s");
-});
diff --git a/scripts/helper_scripts/LineagesSequencesTaxons2LCAs.jar b/scripts/helper_scripts/LineagesSequencesTaxons2LCAs.jar
deleted file mode 100755
index fee277c5..00000000
Binary files a/scripts/helper_scripts/LineagesSequencesTaxons2LCAs.jar and /dev/null differ
diff --git a/scripts/helper_scripts/NamesNodes2TaxonsLineages.jar b/scripts/helper_scripts/NamesNodes2TaxonsLineages.jar
deleted file mode 100644
index 1f3b817d..00000000
Binary files a/scripts/helper_scripts/NamesNodes2TaxonsLineages.jar and /dev/null differ
diff --git a/scripts/helper_scripts/ParallelXmlToTab.js b/scripts/helper_scripts/ParallelXmlToTab.js
deleted file mode 100755
index dd1557eb..00000000
--- a/scripts/helper_scripts/ParallelXmlToTab.js
+++ /dev/null
@@ -1,10 +0,0 @@
-const readline = require('readline');
-const fs = require('fs');
-
-const rl = readline.createInterface({
- input: process.stdin
-});
-
-let buffer = "";
-
-
diff --git a/scripts/helper_scripts/TaxaByChunk.js b/scripts/helper_scripts/TaxaByChunk.js
deleted file mode 100755
index fff59f75..00000000
--- a/scripts/helper_scripts/TaxaByChunk.js
+++ /dev/null
@@ -1,50 +0,0 @@
-/**
- * This script looks for which taxa should be looked up in which chunk. The list of taxa that need to be looked up is
- * read from stdin. A list of files, taxa (thus the taxa that need to be looked up in the corresponding file) are
- * provided through stdout.
- *
- * The script requires two command line arguments: the folder in which all Unipept DB chunks are present and a
- * temporary folder that can be used by the script to store temporary files.
- */
-
-const readline = require("readline");
-const fs = require("fs");
-const path = require("path");
-
-const args = process.argv;
-
-if (args.length !== 4) {
- console.error("This script expects exactly two parameters: unipept_db_chunk_folder temporary_folder");
- process.exit(1);
-}
-
-const rl = readline.createInterface({
- input: process.stdin
-});
-
-const allTaxa = [];
-
-rl.on("line", (line) => {
- allTaxa.push(parseInt(line.trim()));
-});
-
-// In this hook we should start to link input files with the taxa that need to be looked up in there.
-rl.on("close", () => {
- for (const file of fs.readdirSync(args[2])) {
- const baseFile = path.basename(file);
- if (baseFile.match(/unipept\..*\.gz/)) {
- const range = baseFile.replace(/unipept\.|\.gz/g, '').split("-");
- const startRange = parseInt(range[0]);
- const endRange = parseInt(range[1]);
-
- const matchedTaxa = allTaxa.filter(t => startRange <= t && t <= endRange);
-
- if (matchedTaxa && matchedTaxa.length > 0) {
- fs.writeFileSync(path.join(args[3], baseFile + ".pattern"), matchedTaxa.map(t => "\t" + t + "$").join("\n"));
-
- console.log(path.join(args[3], baseFile + ".pattern"));
- console.log(path.join(args[2], file));
- }
- }
- }
-});
diff --git a/scripts/helper_scripts/TaxonsUniprots2Tables.jar b/scripts/helper_scripts/TaxonsUniprots2Tables.jar
deleted file mode 100644
index bf9f13e8..00000000
Binary files a/scripts/helper_scripts/TaxonsUniprots2Tables.jar and /dev/null differ
diff --git a/scripts/helper_scripts/WriteToChunk.js b/scripts/helper_scripts/WriteToChunk.js
deleted file mode 100755
index 594ae4a5..00000000
--- a/scripts/helper_scripts/WriteToChunk.js
+++ /dev/null
@@ -1,49 +0,0 @@
-const readline = require("readline");
-const fs = require("fs");
-const path = require("path");
-
-const outputDir = process.argv[2];
-
-const verbose = process.argv[3] === "true";
-
-const rl = readline.createInterface({
- input: process.stdin
-});
-
-const taxaBounds = [
- 0, 550, 1352, 3047, 5580, 8663, 11676, 32473, 40214, 52774, 66656, 86630, 116960, 162147, 210225, 267979, 334819,
- 408172, 470868, 570509, 673318, 881260, 1046115, 1136135, 1227077, 1300307, 1410620, 1519492, 1650438, 1756149,
- 1820614, 1871070, 1898104, 1922217, 1978231, 2024617, 2026757, 2035430, 2070414, 2202732, 2382165, 2527964, 2601669,
- 2706029, 10000000
-];
-
-const fileObjects = [...Object.keys(taxaBounds)].slice(0, -1).map(idx => Number.parseInt(idx)).map(
- idx => fs.createWriteStream(path.join(outputDir, `unipept.${taxaBounds[idx]}-${taxaBounds[idx + 1]}.chunk`))
-);
-
-let headerSkipped = false;
-
-rl.on("line", (line) => {
- if (verbose) {
- console.error("INFO VERBOSE: writing line to chunk: " + line);
- }
-
- if (!headerSkipped) {
- headerSkipped = true;
- const writeStream = fs.createWriteStream(path.join(outputDir, 'db.header'));
- writeStream.write(line + "\n");
- writeStream.close();
- return;
- }
-
- const taxonId = Number.parseInt(line.split("\t")[8].trim());
-
- let idx = 0;
- while (taxonId > taxaBounds[idx]) {
- idx++;
- }
-
- fileObjects[idx - 1].write(line + "\n");
-});
-
-rl.on("close", () => fileObjects.map(o => o.close()));
diff --git a/scripts/helper_scripts/XmlToTabConverter.jar b/scripts/helper_scripts/XmlToTabConverter.jar
deleted file mode 100644
index 734e2498..00000000
Binary files a/scripts/helper_scripts/XmlToTabConverter.jar and /dev/null differ
diff --git a/scripts/helper_scripts/filter_taxa.sh b/scripts/helper_scripts/filter_taxa.sh
index 017a512d..b9b4e76a 100755
--- a/scripts/helper_scripts/filter_taxa.sh
+++ b/scripts/helper_scripts/filter_taxa.sh
@@ -14,7 +14,7 @@ mkdir -p "$TMP_DIR"
filter_taxa() {
QUERY=$(echo "\s$1\s" | sed "s/,/\\\s\\\|\\\s/g")
- RESULT=$(cat "$LINEAGE_ARCHIVE" | zcat | grep "$QUERY" | cut -f1 | sort -n | uniq | tr '\n' ',')
+ RESULT=$(lz4 -dc "$LINEAGE_ARCHIVE" | grep "$QUERY" | cut -f1 | sort -n | uniq | tr '\n' ',')
echo "$RESULT"
}
@@ -23,16 +23,16 @@ then
TAXA=$(filter_taxa "$TAXA")
# This associative array maps a filename upon the taxa that should be queried within this file
- QUERIES=( $(echo "$TAXA" | tr "," "\n" | node "$CURRENT_LOCATION/TaxaByChunk.js" "$DATABASE_INDEX" "$TMP_DIR") )
+ QUERIES=( $(echo "$TAXA" | tr "," "\n" | $CURRENT_LOCATION/taxa-by-chunk --chunk-dir "$DATABASE_INDEX" --temp-dir "$TMP_DIR") )
if [[ ${#QUERIES[@]} -gt 0 ]]
then
- parallel --jobs 8 --max-args 2 "cat {2} | zcat | sed 's/$/$/' | grep -F -f {1} | sed 's/\$$//'" ::: "${QUERIES[@]}"
+ parallel --jobs 8 --max-args 2 "lz4 -dc {2} | sed 's/$/$/' | grep -F -f {1} | sed 's/\$$//'" ::: "${QUERIES[@]}"
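+ # Note on the sed/grep dance above (added for clarity): the pattern files from taxa-by-chunk
+ # end each pattern in a literal "$" (as the TaxaByChunk.js it replaces did), so a "$" sentinel
+ # is appended to every data line before the fixed-string grep and stripped again afterwards,
+ # anchoring matches to the trailing taxon ID column.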
fi
else
# If the root ID has been passed to this script, we simply print out all database items (without filtering).
- find "$DATABASE_INDEX" -name "*.chunk.gz" | xargs zcat
+ find "$DATABASE_INDEX" -name "*.chunk.lz4" -exec lz4 -mdc {} +
fi
# Remove temporary files
diff --git a/scripts/helper_scripts/parser/pom.xml b/scripts/helper_scripts/parser/pom.xml
deleted file mode 100644
index a8fa31d7..00000000
--- a/scripts/helper_scripts/parser/pom.xml
+++ /dev/null
@@ -1,107 +0,0 @@
-
- 4.0.0
- unipept
- unipept
- 0.0.1-SNAPSHOT
-
- UTF-8
-
-
-
-
- ${basedir}/src/main/java
-
-
-
-
- ${basedir}/src/test/java
-
-
- ${basedir}/src/test/resources
-
- **/*.*
-
-
-
-
-
- maven-compiler-plugin
- 3.1
-
-
- 1.8
-
-
-
- org.apache.maven.plugins
- maven-dependency-plugin
-
-
- copy-dependencies
- prepare-package
-
- copy-dependencies
-
-
- ${project.build.directory}/lib
- false
- false
- true
-
-
-
-
-
- org.apache.maven.plugins
- maven-jar-plugin
- 2.4
-
-
-
- true
- lib/
-
-
-
-
-
-
-
-
- junit
- junit
- 4.11
-
-
- com.beust
- jcommander
- 1.48
-
-
- javax.json
- javax.json-api
- 1.1
-
-
-
- org.glassfish
- javax.json
- 1.1
-
-
-
-
- oracleReleases
- Oracle Released Java Packages
- http://download.oracle.com/maven
- default
-
-
- Unipept
- https://github.ugent.be/bmesuere/unipept
- The Unipept web application supports biodiversity analysis of large and complex metaproteome samples.
-
- https://github.ugent.be/bmesuere/unipept.git
-
-
diff --git a/scripts/helper_scripts/parser/src/manifests/LineagesSequencesTaxons2LCAs/META-INF/MANIFEST.MF b/scripts/helper_scripts/parser/src/manifests/LineagesSequencesTaxons2LCAs/META-INF/MANIFEST.MF
deleted file mode 100755
index d6c5d70b..00000000
--- a/scripts/helper_scripts/parser/src/manifests/LineagesSequencesTaxons2LCAs/META-INF/MANIFEST.MF
+++ /dev/null
@@ -1,3 +0,0 @@
-Manifest-Version: 1.0
-Main-Class: tools.LineagesSequencesTaxons2LCAs
-
diff --git a/scripts/helper_scripts/parser/src/manifests/NamesNodes2TaxonsLineages/META-INF/MANIFEST.MF b/scripts/helper_scripts/parser/src/manifests/NamesNodes2TaxonsLineages/META-INF/MANIFEST.MF
deleted file mode 100755
index 143dcc30..00000000
--- a/scripts/helper_scripts/parser/src/manifests/NamesNodes2TaxonsLineages/META-INF/MANIFEST.MF
+++ /dev/null
@@ -1,3 +0,0 @@
-Manifest-Version: 1.0
-Main-Class: tools.NamesNodes2TaxonsLineages
-
diff --git a/scripts/helper_scripts/parser/src/manifests/TaxonsUniprots2Tables/META-INF/MANIFEST.MF b/scripts/helper_scripts/parser/src/manifests/TaxonsUniprots2Tables/META-INF/MANIFEST.MF
deleted file mode 100755
index 1f0472ba..00000000
--- a/scripts/helper_scripts/parser/src/manifests/TaxonsUniprots2Tables/META-INF/MANIFEST.MF
+++ /dev/null
@@ -1,3 +0,0 @@
-Manifest-Version: 1.0
-Main-Class: tools.TaxonsUniprots2Tables
-
diff --git a/scripts/helper_scripts/parser/src/manifests/XmlToTabConverter/META-INF/MANIFEST.MF b/scripts/helper_scripts/parser/src/manifests/XmlToTabConverter/META-INF/MANIFEST.MF
deleted file mode 100755
index 87a3f9e2..00000000
--- a/scripts/helper_scripts/parser/src/manifests/XmlToTabConverter/META-INF/MANIFEST.MF
+++ /dev/null
@@ -1,3 +0,0 @@
-Manifest-Version: 1.0
-Main-Class: tools.XmlToTabConverter
-
diff --git a/scripts/helper_scripts/parser/src/storage/CSV.java b/scripts/helper_scripts/parser/src/storage/CSV.java
deleted file mode 100755
index 1e5e9587..00000000
--- a/scripts/helper_scripts/parser/src/storage/CSV.java
+++ /dev/null
@@ -1,93 +0,0 @@
-package storage;
-
-import java.io.IOException;
-import java.io.BufferedReader;
-import java.io.BufferedWriter;
-import java.io.InputStreamReader;
-import java.io.OutputStreamWriter;
-import java.util.zip.GZIPInputStream;
-import java.util.zip.GZIPOutputStream;
-import java.io.FileInputStream;
-import java.io.FileOutputStream;
-
-public class CSV {
-
- private static final int MB4 = 4194304;
-
- public static class Reader {
- private BufferedReader buffer;
-
- public Reader(String file) throws IOException {
- buffer = new BufferedReader(
- new InputStreamReader(
- new FileInputStream(file)
- )
- );
- }
-
- public String[] read() throws IOException {
- String line = buffer.readLine();
- if(line == null) return null;
- return line.split(" ");
- }
-
- public void close() throws IOException {
- buffer.close();
- }
- }
-
- public static class Writer {
- protected BufferedWriter buffer;
-
- public Writer(String file) throws IOException {
- buffer = new BufferedWriter(
- new OutputStreamWriter(
- new FileOutputStream(file)
- ), MB4
- );
- }
-
- public void write(String... values) throws IOException {
- buffer.write(values[0]);
- for(int i = 1; i < values.length; i++) {
- buffer.write(" " + (values[i] == null ? "\\N" : values[i]));
- }
- buffer.newLine();
- }
-
- public void close() throws IOException {
- buffer.close();
- }
- }
-
- public static class IndexedWriter extends Writer {
- private long index;
-
- public IndexedWriter(String file) throws IOException {
- super(file);
- index = 0;
- }
-
- @Override
- public void write(String... values) throws IOException {
- buffer.write(Long.toString(++index));
- for(int i = 0; i < values.length; i++) {
- buffer.write(" " + (values[i] == null ? "\\N" : values[i]));
- }
- buffer.newLine();
- }
-
- public long index() {
- return index;
- }
- }
-
- public static String toString(boolean b) {
- return b ? "\1" : "\0";
- }
-
- public static boolean toBoolean(String b) {
- return b.charAt(0) == (char) 1;
- }
-
-}
diff --git a/scripts/helper_scripts/parser/src/storage/TabWriter.java b/scripts/helper_scripts/parser/src/storage/TabWriter.java
deleted file mode 100755
index 191ff47e..00000000
--- a/scripts/helper_scripts/parser/src/storage/TabWriter.java
+++ /dev/null
@@ -1,66 +0,0 @@
-package storage;
-
-import xml.*;
-
-import java.io.*;
-import java.util.stream.Collectors;
-
-public class TabWriter implements UniprotObserver {
- private final BufferedWriter out;
- private final boolean verbose;
-
- public TabWriter(
- OutputStream out,
- boolean verbose
- ) throws IOException {
- this.out = new BufferedWriter(new OutputStreamWriter(out));
- this.verbose = verbose;
-
- // Write header to output file
- this.out.write(String.join("\t", new String[]{
- "Entry",
- "Sequence",
- "Protein names",
- "Version (entry)",
- "EC number",
- "Gene ontology IDs",
- "Cross-reference (InterPro)",
- "Status",
- "Organism ID"
- }) + "\n");
- }
-
- @Override
- public void handleEntry(UniprotEntry entry) {
- try {
- String line = String.join("\t", new String[]{
- entry.getUniprotAccessionNumber(),
- entry.getSequence(),
- entry.getName(),
- String.valueOf(entry.getVersion()),
- entry.getECReferences().stream().map(UniprotECRef::getId).collect(Collectors.joining(";")),
- entry.getGOReferences().stream().map(UniprotGORef::getId).collect(Collectors.joining(";")),
- entry.getInterProReferences().stream().map(UniprotInterProRef::getId).collect(Collectors.joining(";")),
- "swissprot",
- String.valueOf(entry.getTaxonId()),
- });
-
- if (verbose) {
- System.err.println("INFO VERBOSE: Writing tabular line: " + line);
- }
-
- this.out.write(line + "\n");
- } catch (IOException e) {
- System.err.println("Could not write to output stream.");
- }
- }
-
- @Override
- public void close() {
- try {
- this.out.close();
- } catch (IOException e) {
- System.err.println("Could not correctly close output stream.");
- }
- }
-}
diff --git a/scripts/helper_scripts/parser/src/storage/TableWriter.java b/scripts/helper_scripts/parser/src/storage/TableWriter.java
deleted file mode 100755
index 98de7a3f..00000000
--- a/scripts/helper_scripts/parser/src/storage/TableWriter.java
+++ /dev/null
@@ -1,249 +0,0 @@
-package storage;
-
-import taxons.TaxonList;
-import tools.TaxonsUniprots2Tables;
-import xml.*;
-
-import java.util.Set;
-import java.util.stream.Collectors;
-import java.util.stream.Stream;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Map;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.io.IOException;
-import java.io.File;
-import java.sql.Timestamp;
-
-
-/**
- * Intermediate class to add PeptideData to the database
- *
- * @author Bart Mesuere
- * @author Felix Van der Jeugt
- *
- */
-public class TableWriter implements UniprotObserver {
-
- public static final String[] ranks = new String[]{"taxon_id", "superkingdom", "kingdom", "subkingdom", "superphylum", "phylum", "subphylum","superclass", "class", "subclass", "superorder", "order", "suborder", "infraorder", "superfamily", "family", "subfamily", "tribe", "subtribe", "genus", "subgenus", "species_group", "species_subgroup", "species", "subspecies", "strain", "varietas", "forma"};
- private static final Map<String, Integer> rankIndices = new HashMap<>();
-
- static {
- for(int i = 0; i < ranks.length; i++) {
- rankIndices.put(ranks[i], i);
- }
- }
-
- private TaxonList taxonList;
- private Set<Integer> wrongTaxonIds;
-
- // csv files
- private CSV.IndexedWriter peptides;
- private CSV.IndexedWriter uniprotEntries;
- private CSV.IndexedWriter goCrossReferences;
- private CSV.IndexedWriter ecCrossReferences;
- private CSV.IndexedWriter interProCrossReferences;
-
- /**
- * Creates a new data object
- */
- public TableWriter(TaxonsUniprots2Tables args) {
- wrongTaxonIds = new HashSet();
-
- /* Opening CSV files for writing. */
- try {
- taxonList = TaxonList.loadFromFile(args.taxonsFile);
- peptides = new CSV.IndexedWriter(args.peptidesFile);
- uniprotEntries = new CSV.IndexedWriter(args.uniprotEntriesFile);
- ecCrossReferences = new CSV.IndexedWriter(args.ecCrossReferencesFile);
- goCrossReferences = new CSV.IndexedWriter(args.goCrossReferencesFile);
- interProCrossReferences = new CSV.IndexedWriter(args.interProCrossReferencesFile);
- } catch(IOException e) {
- System.err.println(new Timestamp(System.currentTimeMillis())
- + " Error creating tsv files");
- e.printStackTrace();
- System.exit(1);
- }
-
- }
-
- /**
- * Stores a complete UniprotEntry in the database
- *
- * @param entry
- * the UniprotEntry to store
- */
- public void store(UniprotEntry entry) {
- long uniprotEntryId = addUniprotEntry(entry.getUniprotAccessionNumber(), entry.getVersion(),
- entry.getTaxonId(), entry.getType(), entry.getName(), entry.getSequence());
- if (uniprotEntryId != -1) { // failed to add entry
- String faSummary = Stream.of(
- entry.getGOReferences().stream().map(UniprotGORef::getId),
- entry.getECReferences().stream().filter(x -> !x.getId().isEmpty()).map(x->"EC:"+x.getId()),
- entry.getInterProReferences().stream().filter(x -> !x.getId().isEmpty()).map(x->"IPR:"+x.getId())
- ).flatMap(i -> i).collect(Collectors.joining(";"));
-
- for(String sequence : entry.digest()) {
- addData(sequence.replace('I', 'L'), uniprotEntryId, sequence, faSummary);
- }
- for (UniprotGORef ref : entry.getGOReferences())
- addGORef(ref, uniprotEntryId);
- for (UniprotECRef ref : entry.getECReferences())
- addECRef(ref, uniprotEntryId);
- for (UniprotInterProRef ref : entry.getInterProReferences())
- addInterProRef(ref, uniprotEntryId);
- }
- }
-
- /**
- *
- * Inserts the entry info of a uniprot entry into the database and returns
- * the generated id.
- *
- * @param uniprotAccessionNumber
- * The accession number of the entry
- * @param version
- * The version of the entry
- * @param taxonId
- * The taxonId of the organism of the entry
- * @param type
- * The type of the entry. Can be swissprot or trembl
- * @param sequence
- * The full sequence of the peptide.
- * @return The database ID of the uniprot entry.
- */
- public long addUniprotEntry(String uniprotAccessionNumber, int version, int taxonId,
- String type, String name, String sequence) {
- if(0 <= taxonId && taxonId < taxonList.size() && taxonList.get(taxonId) != null) {
- try {
- uniprotEntries.write(
- uniprotAccessionNumber,
- Integer.toString(version),
- Integer.toString(taxonId),
- type,
- name,
- sequence
- );
- return uniprotEntries.index();
- } catch(IOException e) {
- System.err.println(new Timestamp(System.currentTimeMillis())
- + " Error writing to CSV.");
- e.printStackTrace();
- }
- } else {
- if (!wrongTaxonIds.contains(taxonId)) {
- wrongTaxonIds.add(taxonId);
- System.err.println(new Timestamp(System.currentTimeMillis()) + " " + taxonId
- + " added to the list of " + wrongTaxonIds.size() + " invalid taxonIds.");
- }
- }
- return -1;
- }
-
- /**
- * Adds peptide data to the database
- *
- * @param unifiedSequence
- * The sequence of the peptide with AA's I and L the
- * same.
- * @param uniprotEntryId
- * The id of the uniprot entry from which the peptide data was
- * retrieved.
- * @param originalSequence
- * The original sequence of the peptide.
- * @param functionalAnnotations
- * A semicollon separated list of allocated functional analysis terms
- */
- public void addData(String unifiedSequence, long uniprotEntryId, String originalSequence, String functionalAnnotations) {
- try {
- peptides.write(
- unifiedSequence,
- originalSequence,
- Long.toString(uniprotEntryId),
- functionalAnnotations
- );
- } catch(IOException e) {
- System.err.println(new Timestamp(System.currentTimeMillis())
- + " Error adding this peptide to the database: " + unifiedSequence);
- e.printStackTrace();
- }
- }
-
- /**
- * Adds a uniprot entry GO reference to the database
- *
- * @param ref
- * The uniprot GO reference to add
- * @param uniprotEntryId
- * The uniprotEntry of the cross reference
- */
- public void addGORef(UniprotGORef ref, long uniprotEntryId) {
- try {
- goCrossReferences.write(Long.toString(uniprotEntryId), ref.getId());
- } catch (IOException e) {
- System.err.println(new Timestamp(System.currentTimeMillis())
- + " Error adding this GO reference to the database.");
- e.printStackTrace();
- }
-
- }
-
- /**
- * Adds a uniprot entry EC reference to the database
- *
- * @param ref
- * The uniprot EC reference to add
- * @param uniprotEntryId
- * The uniprotEntry of the cross reference
- */
- public void addECRef(UniprotECRef ref, long uniprotEntryId) {
- try {
- ecCrossReferences.write(Long.toString(uniprotEntryId), ref.getId());
- } catch (IOException e) {
- System.err.println(new Timestamp(System.currentTimeMillis())
- + " Error adding this EC reference to the database.");
- e.printStackTrace();
- }
-
- }
-
- /**
- * Adds a uniprot entry InterPro reference to the database
- *
- * @param ref
- * The uniprot InterPro reference to add
- * @param uniprotEntryId
- * The uniprotEntry of the cross reference
- */
- public void addInterProRef(UniprotInterProRef ref, long uniprotEntryId) {
- try {
- interProCrossReferences.write(Long.toString(uniprotEntryId), ref.getId());
- } catch (IOException e) {
- System.err.println(new Timestamp(System.currentTimeMillis())
- + " Error adding this InterPro reference to the database.");
- e.printStackTrace();
- }
- }
-
- @Override
- public void handleEntry(UniprotEntry entry) {
- store(entry);
- }
-
- @Override
- public void close() {
- try {
- uniprotEntries.close();
- peptides.close();
- goCrossReferences.close();
- ecCrossReferences.close();
- interProCrossReferences.close();
- } catch(IOException e) {
- System.err.println(new Timestamp(System.currentTimeMillis())
- + " Something closing the csv files.");
- e.printStackTrace();
- }
- }
-
-}
diff --git a/scripts/helper_scripts/parser/src/taxons/Taxon.java b/scripts/helper_scripts/parser/src/taxons/Taxon.java
deleted file mode 100755
index 43eeb376..00000000
--- a/scripts/helper_scripts/parser/src/taxons/Taxon.java
+++ /dev/null
@@ -1,55 +0,0 @@
-package taxons;
-
-import java.util.EnumMap;
-import java.util.Map;
-
-public class Taxon {
-
- final public String name;
- final public Rank rank;
- final public int parent;
-
- public boolean valid;
-
- public Taxon(String name, Rank rank, int parent) {
- this.name = name;
- this.rank = rank;
- this.parent = parent;
- this.valid = true;
- }
-
- public void invalidate() {
- this.valid = false;
- }
-
- public boolean valid() {
- return this.valid;
- }
-
-
- public static enum Rank {
- NO_RANK, SUPERKINGDOM, KINGDOM, SUBKINGDOM, SUPERPHYLUM, PHYLUM, SUBPHYLUM, SUPERCLASS, CLASS, SUBCLASS, SUPERORDER, ORDER, SUBORDER, INFRAORDER, SUPERFAMILY, FAMILY, SUBFAMILY, TRIBE, SUBTRIBE, GENUS, SUBGENUS, SPECIES_GROUP, SPECIES_SUBGROUP, SPECIES, SUBSPECIES, STRAIN, VARIETAS, FORMA;
-
- public static final Rank[] values = Rank.values();
-
- private static final Map<Rank, Integer> indices = new EnumMap<Rank, Integer>(Rank.class);
- static {
- for(int i = 0; i < values.length; i++) {
- indices.put(values[i], i);
- }
- }
-
- public int index() {
- return indices.get(this);
- }
-
- public String toString() {
- return this.name().toLowerCase().replace('_', ' ');
- }
-
- public static Rank fromString(String s) {
- return valueOf(s.toUpperCase().replace(' ', '_'));
- }
- }
-
-}
diff --git a/scripts/helper_scripts/parser/src/taxons/TaxonList.java b/scripts/helper_scripts/parser/src/taxons/TaxonList.java
deleted file mode 100755
index f0173f2d..00000000
--- a/scripts/helper_scripts/parser/src/taxons/TaxonList.java
+++ /dev/null
@@ -1,172 +0,0 @@
-package taxons;
-
-import storage.CSV;
-
-import java.util.ArrayList;
-import java.util.regex.Pattern;
-import java.io.FileReader;
-import java.io.BufferedReader;
-import java.io.IOException;
-import java.io.FileNotFoundException;
-
-public class TaxonList extends ArrayList<Taxon> {
-
- private static final Pattern PATTERN = Pattern.compile("\\|");
-
- public TaxonList() {
- super();
- }
-
- public static TaxonList loadFromFile(String filename) throws IOException {
- TaxonList tl = new TaxonList();
- CSV.Reader reader = new CSV.Reader(filename);
- String[] row = null;
- while((row = reader.read()) != null) {
- int id = Integer.parseInt(row[0]);
- Taxon t = new Taxon(
- row[1],
- Taxon.Rank.fromString(row[2]),
- Integer.parseInt(row[3])
- );
- if(! CSV.toBoolean(row[4])) t.invalidate();
- while(tl.size() <= id) tl.add(null);
- tl.set(id, t);
- }
- return tl;
- }
-
- public static TaxonList parseDumps(String namesFile, String nodesFile)
- throws FileNotFoundException, IOException {
- TaxonList tl = new TaxonList();
- BufferedReader names = new BufferedReader(new FileReader(namesFile));
- BufferedReader nodes = new BufferedReader(new FileReader(nodesFile));
-
- String nodeline = null;
- while((nodeline = nodes.readLine()) != null) {
- String[] noderow = PATTERN.split(nodeline);
- int taxon_id = Integer.parseInt(noderow[0].trim());
- int parent_id = Integer.parseInt(noderow[1].trim());
- Taxon.Rank rank = Taxon.Rank.fromString(noderow[2].trim());
-
- String nameline = null;
- String name = null, clas = null;
- int taxon_id2 = -1;
- while(!"scientific name".equals(clas) && (nameline = names.readLine()) != null) {
- String[] namerow = PATTERN.split(nameline);
- taxon_id2 = Integer.parseInt(namerow[0].trim());
- name = namerow[1].trim();
- clas = namerow[3].trim();
- }
-
- if("scientific name".equals(clas) && taxon_id == taxon_id2) {
- while(tl.size() <= taxon_id) tl.add(null);
- tl.set(taxon_id, new Taxon(name, rank, parent_id));
- } else {
- throw new RuntimeException("Taxon " + taxon_id +
- " did not have a scientific name.");
- }
- }
-
- names.close();
- nodes.close();
-
- return tl;
-
- }
-
- public void invalidate() {
- for(int i = 0; i < size(); i++) validate(i);
- }
-
- private boolean validate(int taxon_id) {
- Taxon t = get(taxon_id);
-
- if(t == null) return false;
-
- if(! t.valid()
- || (t.rank == Taxon.Rank.SPECIES
- && (
- (t.name.matches(".*\\d.*") && !t.name.contains("virus"))
- || t.name.endsWith(" sp.")
- || t.name.endsWith(" genomosp.")
- || t.name.contains(" bacterium")
- )
- )
- || t.name.contains("enrichment culture")
- || t.name.contains("mixed culture")
- || t.name.contains("uncultured")
- || t.name.contains("unidentified")
- || t.name.contains("unspecified")
- || t.name.contains("undetermined")
- || t.name.contains("sample")
- || t.name.endsWith("metagenome")
- || t.name.endsWith("library")
- || taxon_id == 28384
- || taxon_id == 48479
- || taxon_id == 1869227) {
- t.invalidate();
- return false;
- }
-
- if(taxon_id == 1) return true;
-
- if(! validate(t.parent)) t.invalidate();
- return t.valid();
- }
-
- public void writeToFile(String filename) throws IOException {
- CSV.Writer writer = new CSV.Writer(filename);
- for(int i = 0; i < size(); i++) {
- Taxon t = get(i);
- if(t != null) writer.write(Integer.toString(i), t.name,
- t.rank.toString(), Integer.toString(t.parent),
- CSV.toString(t.valid()));
- }
- writer.close();
- }
-
- public void writeLineagesToFile(String filename) throws IOException {
- CSV.Writer writer = new CSV.Writer(filename);
- int nranks = Taxon.Rank.values.length;
-
- for(int i = 0; i < size(); i++) {
- Taxon t = get(i);
- if(t == null) continue;
-
- // +1 want - no_rank + lineage_id + taxon_id
- String[] lineage = new String[nranks];
- lineage[0] = Integer.toString(i);
-
- int tid = rankedAncestor(i);
- t = get(tid);
- boolean valid = t.valid();
- for(int j = nranks - 1; j >= 1; j--) {
- if(j > t.rank.index()) {
- lineage[j] = valid ? null : "-1";
- } else {
- valid = t.valid();
- lineage[j] = Integer.toString((valid ? 1 : -1) * tid);
- tid = rankedAncestor(t.parent);
- t = get(tid);
- }
- }
-
- writer.write(lineage);
- }
-
- writer.close();
- }
-
- private int rankedAncestor(int tid) {
- Taxon t = get(tid);
- int pid = -1;
- while(t != null && tid != pid && t.rank == Taxon.Rank.NO_RANK) {
- pid = tid;
- tid = t.parent;
- t = get(tid);
- }
- if(t != null) return tid;
- return 1; // only used in case a taxon is no descendant of root
- }
-
-}
diff --git a/scripts/helper_scripts/parser/src/tools/LineagesSequencesTaxons2LCAs.java b/scripts/helper_scripts/parser/src/tools/LineagesSequencesTaxons2LCAs.java
deleted file mode 100755
index 67cf3630..00000000
--- a/scripts/helper_scripts/parser/src/tools/LineagesSequencesTaxons2LCAs.java
+++ /dev/null
@@ -1,141 +0,0 @@
-package tools;
-
-import java.io.*;
-import java.sql.Timestamp;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Collection;
-import java.util.HashMap;
-import java.util.regex.Pattern;
-
-public class LineagesSequencesTaxons2LCAs {
-
- public static final int GENUS = 18;
- public static final int SPECIES = 22;
- public static final int RANKS = 27;
- private static final Pattern SEPARATOR = Pattern.compile("\t");
- private static final String NULL = "\\N";
- private int[][] taxonomy;
- private final Writer writer;
-
- public LineagesSequencesTaxons2LCAs(String taxonomyFile) throws IOException {
- writer = new BufferedWriter(new OutputStreamWriter(System.out, "utf-8"));
- buildTaxonomy(taxonomyFile);
- }
-
- private void buildTaxonomy(String file) throws FileNotFoundException, IOException {
- HashMap<Integer, int[]> taxonomyMap = new HashMap<>();
- InputStream is = new FileInputStream(new File(file));
- BufferedReader br = new BufferedReader(new InputStreamReader(is));
-
- br.lines()
- .forEach(line -> {
- String[] elements = SEPARATOR.split(line, 28);
-
- int key = Integer.parseInt(elements[0]);
- int[] lineage = Arrays.stream(elements)
- .skip(1)// skip taxonId
- .mapToInt(s -> s.toUpperCase().equals("\\N") ? 0 : Integer.parseInt(s))
- .toArray();
-
- taxonomyMap.put(key, lineage);
- });
-
- int max = taxonomyMap.keySet().stream().max(Integer::compare).get();
- taxonomy = new int[max + 1][];
- taxonomyMap.keySet().stream().forEach(key -> taxonomy[key] = taxonomyMap.get(key));
- }
-
- public void calculateLCAs() throws IOException {
- BufferedReader br = new BufferedReader(new InputStreamReader(System.in), 67108864);
-
- int count = 0;
- String currentSequence = null;
- Collection<Integer> taxa = new ArrayList<>();
- String line;
- while ((line = br.readLine()) != null) {
- count++;
- if (count % 10000000 == 0) {
- System.err.println(new Timestamp(System.currentTimeMillis()) + ": " + count);
- }
-
- // outperforms split by at least 20%
- int t = line.indexOf('\t');
- String sequence = line.substring(0, t);
- int taxonId = Integer.parseInt(line.substring(t + 1));
-
- if (currentSequence == null || !currentSequence.equals(sequence)) {
- if (currentSequence != null) {
- handleLCA(currentSequence, calculateLCA(taxa));
- }
-
- currentSequence = sequence;
- taxa.clear();
- }
-
- taxa.add(taxonId);
- }
- handleLCA(currentSequence, calculateLCA(taxa));
- }
-
- private int calculateLCA(Collection<Integer> taxa) {
- int lca = 1;
- int[][] lineages = taxa.stream()
- .map(t -> taxonomy[t])
- .filter(l -> l != null)
- .toArray(int[][]::new);
- for (int rank = 0; rank < RANKS; rank++) {
- final int finalRank = rank;
- final int[] val = {-1};
- boolean allMatch = Arrays.stream(lineages)
- .mapToInt(l -> l[finalRank])
- .filter(i -> finalRank == GENUS || finalRank == SPECIES ? i > 0 : i >= 0)
- .peek(i -> val[0] = val[0] == -1 ? i : val[0])
- .allMatch(i -> i == val[0]);
-
- if (val[0] != -1) {
- if (!allMatch) {
- break;
- }
- if (val[0] != 0) {
- lca = val[0];
- }
- }
- }
- return lca;
- }
-
- private void handleLCA(String sequence, int lca) {
- try {
- writer.write(sequence + "\t" + lca + '\n');
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
-
- public void close() throws IOException {
- writer.close();
- }
-
- /**
- * first argument should be the lineages in tsv format without a header row. Create by running:
- * $ echo "select * from lineages;" | mysql -u unipept -p unipept | sed 1d > lineages.tsv
- *
- * standard input should be the peptides in tsv format with a header row. Create by running:
- * $ echo "select sequence_id, taxon_id from peptides left join uniprot_entries on peptides.uniprot_entry_id = uniprot_entries.id;" | \n
- * mysql -u unipept -p unipept -q | sort -S 50% --parallel=12 -k1n > sequences.tsv
- *
- * @param args
- */
- public static void main(String... args) {
- try {
- System.err.println(new Timestamp(System.currentTimeMillis()) + ": reading taxonomy");
- LineagesSequencesTaxons2LCAs l = new LineagesSequencesTaxons2LCAs(args[0]);
- System.err.println(new Timestamp(System.currentTimeMillis()) + ": reading sequences");
- l.calculateLCAs();
- l.close();
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
-}
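
The per-sequence LCA aggregation performed by this tool now lives in the Rust calculate_lcas module (declared in lib.rs below). For reference, a minimal sketch of the rule implemented above, with the rank constants taken from the deleted source; it is illustrative only and does not mirror the Rust module's actual API.

    const GENUS: usize = 18;
    const SPECIES: usize = 22;
    const RANKS: usize = 27;

    /// Walk the ranks from the top of the lineage downwards; as soon as the
    /// lineages disagree on a rank, the last agreed-upon taxon is the LCA.
    /// A 0 entry means "rank absent" (\N); negative entries mark invalidated
    /// taxa (see writeLineagesToFile above).
    fn calculate_lca(lineages: &[[i32; RANKS]]) -> i32 {
        let mut lca = 1; // fall back to the root taxon
        for rank in 0..RANKS {
            let known: Vec<i32> = lineages
                .iter()
                .map(|l| l[rank])
                // genus and species must be strictly positive, other ranks may be 0
                .filter(|&i| if rank == GENUS || rank == SPECIES { i > 0 } else { i >= 0 })
                .collect();
            if let Some(&first) = known.first() {
                if known.iter().any(|&i| i != first) {
                    break;
                }
                if first != 0 {
                    lca = first;
                }
            }
        }
        lca
    }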
diff --git a/scripts/helper_scripts/parser/src/tools/NamesNodes2TaxonsLineages.java b/scripts/helper_scripts/parser/src/tools/NamesNodes2TaxonsLineages.java
deleted file mode 100755
index 06a72e56..00000000
--- a/scripts/helper_scripts/parser/src/tools/NamesNodes2TaxonsLineages.java
+++ /dev/null
@@ -1,36 +0,0 @@
-package tools;
-
-import java.io.IOException;
-import java.io.FileNotFoundException;
-
-import com.beust.jcommander.Parameter;
-import com.beust.jcommander.JCommander;
-import taxons.TaxonList;
-
-public class NamesNodes2TaxonsLineages {
-
- @Parameter(names="--names", description="Taxon names input file") public String namesFile;
- @Parameter(names="--nodes", description="Taxon nodes input file") public String nodesFile;
- @Parameter(names="--taxons", description="Taxon TSV output file") public String taxonsFile;
- @Parameter(names="--lineages", description="Lineages TSV output file") public String lineagesFile;
-
- /**
- * Parse a list of taxons and their lineages from the NCBI dumps.
- *
- * This program will parse the first two argument files, and create the next
- * two. The first two arguments are the nodes.dmp and names.dmp files
- * downloaded from the NCBI. TSV-dumps of the parsed taxons and lineages
- * will be written to the third and fourth parameter.
- */
- public static void main(String[] args) throws IOException {
- NamesNodes2TaxonsLineages main = new NamesNodes2TaxonsLineages();
- new JCommander(main, args);
-
- TaxonList tl = TaxonList.parseDumps(main.namesFile, main.nodesFile);
- tl.invalidate();
- tl.writeToFile(main.taxonsFile);
- tl.writeLineagesToFile(main.lineagesFile);
- }
-
-}
-
diff --git a/scripts/helper_scripts/parser/src/tools/TaxonsUniprots2Tables.java b/scripts/helper_scripts/parser/src/tools/TaxonsUniprots2Tables.java
deleted file mode 100644
index 14763ee7..00000000
--- a/scripts/helper_scripts/parser/src/tools/TaxonsUniprots2Tables.java
+++ /dev/null
@@ -1,49 +0,0 @@
-package tools;
-
-import java.io.IOException;
-
-import com.beust.jcommander.Parameter;
-import com.beust.jcommander.JCommander;
-
-import storage.TableWriter;
-import tsv.UniprotTabParser;
-
-public class TaxonsUniprots2Tables {
-
- @Parameter(names="--peptide-min", description="Minimum peptide length") public int peptideMin;
- @Parameter(names="--peptide-max", description="Maximum peptide length") public int peptideMax;
- @Parameter(names="--taxons", description="Taxons TSV input file") public String taxonsFile;
- @Parameter(names="--peptides", description="Peptides TSV output file") public String peptidesFile;
- @Parameter(names="--uniprot-entries", description="Uniprot entries TSV output file") public String uniprotEntriesFile;
- @Parameter(names="--ec", description="EC references TSV output file") public String ecCrossReferencesFile;
- @Parameter(names="--go", description="GO references TSV output file") public String goCrossReferencesFile;
- @Parameter(names="--interpro", description="InterPro references TSV output file") public String interProCrossReferencesFile;
- @Parameter(names="--verbose", description="Enable verbose mode") public boolean verboseMode;
-
- /**
- * Parse the UniProt TSV-file into TSV tables.
- *
- * The first parameter is a taxon file, as written by NamesNodes2Taxons. The next 5 parameters are the output files,
- * all in TSV format. In order, they are: the peptides, the uniprot entries, the EC cross references, the GO cross
- * references and the InterPro cross references.
- *
- * This program reads input from stdin and writes output to the files indicated by the parameters given above.
- */
- public static void main(String[] args) throws IOException {
- TaxonsUniprots2Tables main = new TaxonsUniprots2Tables();
- new JCommander(main, args);
-
- if (main.verboseMode) {
- System.err.println("INFO: TaxonsUniprots2Tables - Verbose mode enabled.");
- }
-
- TableWriter writer = new TableWriter(main);
-
- UniprotTabParser parser = new UniprotTabParser();
- parser.parse(main.peptideMin, main.peptideMax, System.in, writer, main.verboseMode);
-
- writer.close();
- }
-
-}
-
diff --git a/scripts/helper_scripts/parser/src/tools/XmlToTabConverter.java b/scripts/helper_scripts/parser/src/tools/XmlToTabConverter.java
deleted file mode 100755
index ab045f19..00000000
--- a/scripts/helper_scripts/parser/src/tools/XmlToTabConverter.java
+++ /dev/null
@@ -1,35 +0,0 @@
-package tools;
-
-import org.xml.sax.SAXException;
-import storage.TabWriter;
-import xml.UniprotHandler;
-
-import javax.xml.parsers.ParserConfigurationException;
-import javax.xml.parsers.SAXParser;
-import javax.xml.parsers.SAXParserFactory;
-import java.io.*;
-
-/**
- * This tool accepts 3 different arguments:
- * peptide_min_length, peptide_max_length, database_type_name
- *
- * The input is read from stdin and the output of this script is written to stdout.
- *
- * This tool's job is to produce a TSV-file with the same contents as the XML-file that's fed into this script.
- */
-public class XmlToTabConverter {
- public static void main(String[] args) throws IOException, SAXException, ParserConfigurationException {
- SAXParser parser = SAXParserFactory.newInstance().newSAXParser();
-
- InputStream uniprotStream = System.in;
- UniprotHandler handler = new UniprotHandler(Integer.parseInt(args[0]), Integer.parseInt(args[1]), args[2]);
-
- TabWriter writer = new TabWriter(System.out, Boolean.parseBoolean(args[3]));
- handler.addObserver(writer);
-
- parser.parse(uniprotStream, handler);
-
- uniprotStream.close();
- writer.close();
- }
-}
diff --git a/scripts/helper_scripts/parser/src/tsv/UniprotTabParser.java b/scripts/helper_scripts/parser/src/tsv/UniprotTabParser.java
deleted file mode 100755
index 0d057cb8..00000000
--- a/scripts/helper_scripts/parser/src/tsv/UniprotTabParser.java
+++ /dev/null
@@ -1,77 +0,0 @@
-package tsv;
-
-import xml.*;
-
-import java.io.*;
-import java.util.HashMap;
-import java.util.Map;
-import java.util.stream.Stream;
-
-public class UniprotTabParser {
- public void parse(
- int peptideMinLength,
- int peptideMaxLength,
- InputStream input,
- UniprotObserver observer,
- boolean verbose
- ) throws IOException {
- BufferedReader reader = new BufferedReader(new InputStreamReader(input));
-
- String line = reader.readLine().trim();
- String[] header = Stream.of(line.split("\t")).map(String::trim).toArray(String[]::new);
-
- Map<String, Integer> headerMap = new HashMap<>();
- for (int i = 0; i < header.length; i++) {
- headerMap.put(header[i], i);
- }
-
- line = reader.readLine();
-
- while (line != null) {
- if (verbose) {
- System.err.println("INFO VERBOSE: TSV line parsed: " + line);
- }
-
- String[] fields = line.trim().split("\t");
-
- try {
- // We need to emit one new UniprotEntry per line in the input
- UniprotEntry entry = new UniprotEntry(fields[headerMap.get("Status")].trim(), peptideMinLength, peptideMaxLength);
-
- // Now convert all fields into the correct Uniprot entry properties
- entry.setUniprotAccessionNumber(fields[headerMap.get("Entry")]);
- entry.setSequence(fields[headerMap.get("Sequence")].trim());
-
- entry.setRecommendedName(fields[headerMap.get("Protein names")].trim());
- // Todo, does not always need to be set?
- // entry.setSubmittedName("name");
-
- entry.setVersion(Integer.parseInt(fields[headerMap.get("Version (entry)")].trim()));
-
- for (String ecNumber : fields[headerMap.get("EC number")].split(";")) {
- entry.addECRef(new UniprotECRef(ecNumber.trim()));
- }
-
- for (String goTerm : fields[headerMap.get("Gene ontology IDs")].split(";")) {
- entry.addGORef(new UniprotGORef(goTerm.trim()));
- }
-
- for (String interpro : fields[headerMap.get("Cross-reference (InterPro)")].split(";")) {
- entry.addInterProRef(new UniprotInterProRef(interpro.trim()));
- }
-
- entry.setTaxonId(Integer.parseInt(fields[headerMap.get("Organism ID")]));
-
- // Emit entry that's finished and handle it...
- observer.handleEntry(entry);
- } catch (Exception e) {
- System.err.println("Invalid entry ignored: " + line);
- System.err.println("Invalid entry error details: " + e.getMessage());
- }
-
- line = reader.readLine();
- }
-
- reader.close();
- }
-}
diff --git a/scripts/helper_scripts/parser/src/xml/UniprotDbRef.java b/scripts/helper_scripts/parser/src/xml/UniprotDbRef.java
deleted file mode 100755
index cdbd58cd..00000000
--- a/scripts/helper_scripts/parser/src/xml/UniprotDbRef.java
+++ /dev/null
@@ -1,39 +0,0 @@
-package xml;
-
-public class UniprotDbRef {
-
- private String type;
- private String sequenceId;
- private String proteinId;
-
- public UniprotDbRef(String type, String sequenceId, String proteinId) {
- this.type = type;
- this.sequenceId = sequenceId;
- this.proteinId = proteinId;
- }
-
- public UniprotDbRef(String type) {
- this.type = type;
- }
-
- public String getType() {
- return type;
- }
-
- public String getSequenceId() {
- return sequenceId;
- }
-
- public void setSequenceId(String sequenceId) {
- this.sequenceId = sequenceId;
- }
-
- public String getProteinId() {
- return proteinId;
- }
-
- public void setProteinId(String proteinId) {
- this.proteinId = proteinId;
- }
-
-}
diff --git a/scripts/helper_scripts/parser/src/xml/UniprotECRef.java b/scripts/helper_scripts/parser/src/xml/UniprotECRef.java
deleted file mode 100755
index 4adba41f..00000000
--- a/scripts/helper_scripts/parser/src/xml/UniprotECRef.java
+++ /dev/null
@@ -1,14 +0,0 @@
-package xml;
-
-public class UniprotECRef {
-
- private String id;
-
- public UniprotECRef(String id) {
- this.id = id;
- }
-
- public String getId() {
- return id;
- }
-}
diff --git a/scripts/helper_scripts/parser/src/xml/UniprotEntry.java b/scripts/helper_scripts/parser/src/xml/UniprotEntry.java
deleted file mode 100755
index 870442da..00000000
--- a/scripts/helper_scripts/parser/src/xml/UniprotEntry.java
+++ /dev/null
@@ -1,162 +0,0 @@
-package xml;
-
-import java.util.Arrays;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.stream.Stream;
-
-/**
- * @author Bart Mesuere
- *
- */
-public class UniprotEntry {
-
- // peptide settings
- private final int peptideMin;
- private final int peptideMax;
-
- private String uniprotAccessionNumber;
- private int version;
- private int taxonId;
- private String type;
- private String recommendedName;
- private String submittedName;
- private String sequence;
- private List<UniprotDbRef> dbReferences;
- private List<UniprotGORef> goReferences;
- private List<UniprotECRef> ecReferences;
- private List<UniprotInterProRef> interProReferences;
- private List<String> sequences;
-
- public UniprotEntry(String type, int peptideMin, int peptideMax) {
- this.type = type;
- this.peptideMin = peptideMin;
- this.peptideMax = peptideMax;
- dbReferences = new ArrayList();
- goReferences = new ArrayList();
- ecReferences = new ArrayList();
- interProReferences = new ArrayList();
- sequences = new ArrayList();
- }
-
- public void reset(String type) {
- uniprotAccessionNumber = null;
- version = 0;
- taxonId = 0;
- this.type = type;
- recommendedName = null;
- submittedName = null;
- sequence = null;
- dbReferences.clear();
- goReferences.clear();
- ecReferences.clear();
- interProReferences.clear();
- sequences.clear();
- }
-
- public String getUniprotAccessionNumber() {
- return uniprotAccessionNumber;
- }
-
- public void setUniprotAccessionNumber(String uniprotAccessionNumber) {
- if(this.uniprotAccessionNumber == null) {
- this.uniprotAccessionNumber = uniprotAccessionNumber;
- }
- }
-
- public int getVersion() {
- return version;
- }
-
- public void setVersion(int version) {
- this.version = version;
- }
-
- public int getTaxonId() {
- return taxonId;
- }
-
- public void setTaxonId(int taxonId) {
- this.taxonId = taxonId;
- }
-
- public String getType() {
- return type;
- }
-
- public String getName() {
- if(recommendedName != null) return recommendedName;
- return submittedName;
- }
-
- public void setRecommendedName(String name) {
- recommendedName = name;
- }
-
- public void setSubmittedName(String name) {
- submittedName = name;
- }
-
- public String getSequence() {
- return sequence;
- }
-
- public void setSequence(String sequence) {
- this.sequence = sequence.replace(" ", "");
- }
-
- public void addDbRef(UniprotDbRef ref) {
- dbReferences.add(ref);
- }
-
- public void addGORef(UniprotGORef ref) {
- goReferences.add(ref);
- }
-
- public void addECRef(UniprotECRef ref) {
- ecReferences.add(ref);
- }
-
- public void addInterProRef(UniprotInterProRef ref) { interProReferences.add(ref); }
-
- public List<String> digest() {
- sequences.clear();
- int start = 0;
- int length = sequence.length();
- for (int i = 0; i < length; i++) {
- char x = sequence.charAt(i);
- if ((x == 'K' || x == 'R') && (i + 1 < length && sequence.charAt(i + 1) != 'P')) {
- if (i + 1 - start >= peptideMin && i + 1 - start <= peptideMax) {
- sequences.add(sequence.substring(start, i + 1));
- }
- start = i + 1;
- }
- }
- if (length - start >= peptideMin && length - start <= peptideMax) {
- sequences.add(sequence.substring(start, length));
- }
- return sequences;
- }
-
- public List<UniprotDbRef> getDbReferences() {
- return dbReferences;
- }
-
- public List<UniprotGORef> getGOReferences() {
- return goReferences;
- }
-
- public List<UniprotECRef> getECReferences() {
- return ecReferences;
- }
-
- public List<UniprotInterProRef> getInterProReferences() { return interProReferences; }
-
-
- @Override
- public String toString() {
- return uniprotAccessionNumber + ", " + version + ", " + taxonId + ", " + type + ", "
- + sequence;
- }
-
-}
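
The digest() method deleted above encodes the tryptic cleavage rule that decides which peptides reach the peptides table: cleave after K or R unless the next residue is P, and keep only fragments within the --peptide-min/--peptide-max bounds. A minimal Rust sketch of the same rule, for reference only (the names are illustrative, not the Rust crate's API):

    /// Cleave after K or R unless the next residue is P; keep fragments whose
    /// length lies within [min, max].
    fn digest(sequence: &str, min: usize, max: usize) -> Vec<&str> {
        let bytes = sequence.as_bytes();
        let mut peptides = Vec::new();
        let mut start = 0;
        for i in 0..bytes.len() {
            if (bytes[i] == b'K' || bytes[i] == b'R')
                && i + 1 < bytes.len()
                && bytes[i + 1] != b'P'
            {
                if (min..=max).contains(&(i + 1 - start)) {
                    peptides.push(&sequence[start..=i]);
                }
                start = i + 1;
            }
        }
        if (min..=max).contains(&(bytes.len() - start)) {
            peptides.push(&sequence[start..]);
        }
        peptides
    }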
diff --git a/scripts/helper_scripts/parser/src/xml/UniprotGORef.java b/scripts/helper_scripts/parser/src/xml/UniprotGORef.java
deleted file mode 100755
index d1c909f1..00000000
--- a/scripts/helper_scripts/parser/src/xml/UniprotGORef.java
+++ /dev/null
@@ -1,14 +0,0 @@
-package xml;
-
-public class UniprotGORef {
-
- private String id;
-
- public UniprotGORef(String id) {
- this.id = id;
- }
-
- public String getId() {
- return id;
- }
-}
diff --git a/scripts/helper_scripts/parser/src/xml/UniprotHandler.java b/scripts/helper_scripts/parser/src/xml/UniprotHandler.java
deleted file mode 100755
index 8164d162..00000000
--- a/scripts/helper_scripts/parser/src/xml/UniprotHandler.java
+++ /dev/null
@@ -1,249 +0,0 @@
-package xml;
-
-import java.sql.Timestamp;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-
-import org.xml.sax.Attributes;
-import org.xml.sax.SAXException;
-import org.xml.sax.helpers.DefaultHandler;
-
-public class UniprotHandler extends DefaultHandler {
-
- private final String uniprotType;
-
- private UniprotEntry currentItem;
- private UniprotDbRef dbRef;
- private UniprotGORef goRef;
- private UniprotECRef ecRef;
- private UniprotInterProRef interProRef;
- private StringBuilder charData;
- private int i;
- private boolean inComment = false;
- private boolean inOrganism = false;
- private boolean inEvidence = false;
- private boolean inRecommendedName = false;
- private boolean inSubmittedName = false;
- private List<UniprotObserver> observers;
-
- private Map<String, EndTagWorker> endTagWorkers;
- private Map<String, StartTagWorker> startTagWorkers;
-
- public UniprotHandler(int peptideMinLength, int peptideMaxLength, String uniprotType) {
- super();
- this.uniprotType = uniprotType;
- currentItem = new UniprotEntry(uniprotType, peptideMinLength, peptideMaxLength);
- charData = new StringBuilder();
- observers = new ArrayList();
-
- // set up end tag workers
- endTagWorkers = new HashMap();
- endTagWorkers.put("entry", new EndTagWorker() {
- @Override
- public void handleTag(String data) {
- emitEntry(currentItem);
- }
- });
- endTagWorkers.put("accession", new EndTagWorker() {
- @Override
- public void handleTag(String data) {
- currentItem.setUniprotAccessionNumber(data);
- }
- });
- endTagWorkers.put("organism", new EndTagWorker() {
- @Override
- public void handleTag(String data) {
- inOrganism = false;
- }
- });
- endTagWorkers.put("evidence", new EndTagWorker() {
- @Override
- public void handleTag(String data) {
- inEvidence = false;
- }
- });
- endTagWorkers.put("recommendedName", new EndTagWorker() {
- @Override
- public void handleTag(String data) {
- inRecommendedName = false;
- }
- });
- endTagWorkers.put("submittedName", new EndTagWorker() {
- @Override
- public void handleTag(String data) {
- inSubmittedName = false;
- }
- });
- endTagWorkers.put("sequence", new EndTagWorker() {
- @Override
- public void handleTag(String data) {
- currentItem.setSequence(data);
- }
- });
- endTagWorkers.put("dbReference", new EndTagWorker() {
- @Override
- public void handleTag(String data) {
- if (inComment) {
- return;
- }
-
- if (!inOrganism) {
- if (dbRef != null) {
- currentItem.addDbRef(dbRef);
- dbRef = null;
- } else if (goRef != null) {
- currentItem.addGORef(goRef);
- goRef = null;
- } else if (ecRef != null) {
- currentItem.addECRef(ecRef);
- ecRef = null;
- } else if (interProRef != null) {
- currentItem.addInterProRef(interProRef);
- interProRef = null;
- }
- }
- }
- });
- endTagWorkers.put("fullName", new EndTagWorker() {
- @Override
- public void handleTag(String data) {
- if (inRecommendedName) {
- currentItem.setRecommendedName(data);
- } else if (inSubmittedName) {
- currentItem.setSubmittedName(data);
- }
- }
- });
- endTagWorkers.put("comment", new EndTagWorker() {
- @Override
- public void handleTag(String data) {
- inComment = false;
- }
- });
-
- // set up start tag workers
- startTagWorkers = new HashMap();
- startTagWorkers.put("entry", new StartTagWorker() {
- @Override
- public void handleTag(Attributes atts) {
- newCurrentItem();
- currentItem.setVersion(Integer.valueOf(atts.getValue("version")));
- }
- });
- startTagWorkers.put("organism", new StartTagWorker() {
- @Override
- public void handleTag(Attributes atts) {
- inOrganism = true;
- }
- });
- startTagWorkers.put("evidence", new StartTagWorker() {
- @Override
- public void handleTag(Attributes atts) {
- inEvidence = true;
- }
- });
- startTagWorkers.put("recommendedName", new StartTagWorker() {
- @Override
- public void handleTag(Attributes atts) {
- inRecommendedName = true;
- }
- });
- startTagWorkers.put("submittedName", new StartTagWorker() {
- @Override
- public void handleTag(Attributes atts) {
- inSubmittedName = true;
- }
- });
- startTagWorkers.put("dbReference", new StartTagWorker() {
- @Override
- public void handleTag(Attributes atts) {
- // Skip references if they are embedded in comments (otherwise, these could cause duplicate identifiers)
- if (inComment) {
- return;
- }
-
- if (inOrganism) {
- if (atts.getValue("type").equals("NCBI Taxonomy"))
- currentItem.setTaxonId(Integer.valueOf(atts.getValue("id")));
- } else if (!inEvidence) {
- if (atts.getValue("type").equals("EMBL")) {
- dbRef = new UniprotDbRef("EMBL");
- dbRef.setSequenceId(atts.getValue("id"));
- } else if (atts.getValue("type").equals("RefSeq")) {
- dbRef = new UniprotDbRef("RefSeq");
- dbRef.setProteinId(atts.getValue("id"));
- } else if (atts.getValue("type").equals("GO")) {
- goRef = new UniprotGORef(atts.getValue("id"));
- } else if (atts.getValue("type").equals("EC")) {
- ecRef = new UniprotECRef(atts.getValue("id"));
- } else if (atts.getValue("type").equals("InterPro")) {
- interProRef = new UniprotInterProRef(atts.getValue("id"));
- }
- }
- }
- });
- startTagWorkers.put("property", new StartTagWorker() {
- @Override
- public void handleTag(Attributes atts) {
- if (dbRef != null) {
- if (atts.getValue("type").equals("protein sequence ID"))
- dbRef.setProteinId(atts.getValue("value"));
- else if (atts.getValue("type").equals("nucleotide sequence ID"))
- dbRef.setSequenceId(atts.getValue("value"));
- }
- }
- });
- startTagWorkers.put("comment", new StartTagWorker() {
- @Override
- public void handleTag(Attributes atts) {
- inComment = true;
- }
- });
- }
-
- @Override
- public void startElement(String namespaceURI, String localName, String qName, Attributes atts) {
- StartTagWorker worker = startTagWorkers.get(qName);
- if (worker != null) {
- worker.handleTag(atts);
- }
- }
-
- @Override
- public void endElement(String uri, String localName, String qName) throws SAXException {
- EndTagWorker worker = endTagWorkers.get(qName);
- if (worker != null) {
- worker.handleTag(charData.toString().trim());
- }
- charData.delete(0, charData.length());
- }
-
- @Override
- public void characters(char[] ch, int start, int length) throws SAXException {
- charData.append(ch, start, length);
- }
-
- private void newCurrentItem() {
- currentItem.reset(uniprotType);
- }
-
- private interface StartTagWorker {
- void handleTag(Attributes att);
- }
-
- private interface EndTagWorker {
- void handleTag(String data);
- }
-
- public void addObserver(UniprotObserver o) {
- observers.add(o);
- }
-
- private void emitEntry(UniprotEntry entry) {
- for (UniprotObserver o : observers) {
- o.handleEntry(entry);
- }
- }
-}
diff --git a/scripts/helper_scripts/parser/src/xml/UniprotInterProRef.java b/scripts/helper_scripts/parser/src/xml/UniprotInterProRef.java
deleted file mode 100755
index 87211e91..00000000
--- a/scripts/helper_scripts/parser/src/xml/UniprotInterProRef.java
+++ /dev/null
@@ -1,14 +0,0 @@
-package xml;
-
-public class UniprotInterProRef {
-
- private String id;
-
- public UniprotInterProRef(String id) {
- this.id = id;
- }
-
- public String getId() {
- return id;
- }
-}
diff --git a/scripts/helper_scripts/parser/src/xml/UniprotObserver.java b/scripts/helper_scripts/parser/src/xml/UniprotObserver.java
deleted file mode 100755
index 45bf4bc2..00000000
--- a/scripts/helper_scripts/parser/src/xml/UniprotObserver.java
+++ /dev/null
@@ -1,8 +0,0 @@
-package xml;
-
-import xml.UniprotEntry;
-
-public interface UniprotObserver {
- public void handleEntry(UniprotEntry entry);
- public void close();
-}
diff --git a/scripts/helper_scripts/unipept-database-rs/src/bin/dat-parser.rs b/scripts/helper_scripts/unipept-database-rs/src/bin/dat-parser.rs
index 007716d3..7e9e3c6b 100644
--- a/scripts/helper_scripts/unipept-database-rs/src/bin/dat-parser.rs
+++ b/scripts/helper_scripts/unipept-database-rs/src/bin/dat-parser.rs
@@ -2,7 +2,6 @@ use anyhow::{Context, Result};
use clap::Parser;
use unipept_database::dat_parser::uniprot_dat_parser;
use unipept_database::dat_parser::utils::write_header;
-use unipept_database::uniprot::UniprotType;
use unipept_database::utils::files::open_sin;
@@ -24,8 +23,8 @@ fn main() -> Result<()> {
#[derive(Parser, Debug)]
struct Cli {
- #[clap(value_enum, short = 't', long, default_value_t = UniprotType::Swissprot)]
- db_type: UniprotType,
+ #[clap(short = 't', long, default_value = "swissprot")]
+ db_type: String,
#[clap(long, default_value_t = 0)]
threads: usize,
}
diff --git a/scripts/helper_scripts/unipept-database-rs/src/bin/taxa-by-chunk.rs b/scripts/helper_scripts/unipept-database-rs/src/bin/taxa-by-chunk.rs
index 64a20225..a19958ff 100644
--- a/scripts/helper_scripts/unipept-database-rs/src/bin/taxa-by-chunk.rs
+++ b/scripts/helper_scripts/unipept-database-rs/src/bin/taxa-by-chunk.rs
@@ -31,7 +31,7 @@ fn main() -> Result<()> {
all_taxa.push(taxa_id);
}
- let chunk_file_regex = Regex::new(r"unipept\..*\.gz").context("Error creating regex")?;
+ let chunk_file_regex = Regex::new(r"unipept\..*\.lz4").context("Error creating regex")?;
for entry in read_dir(&args.chunk_dir).context("Error reading chunk directory")? {
let entry = entry.context("Error reading entry from chunk directory")?;
@@ -52,7 +52,7 @@ fn main() -> Result<()> {
}
// Parse the taxa range out of the filename
- let replaced_name = base_name.replace("unipept.", "").replace(".chunk.gz", "");
+ let replaced_name = base_name.replace("unipept.", "").replace(".chunk.lz4", "");
let range = replaced_name.split_once('-');
let range = range.with_context(|| format!("Unable to split {replaced_name} on '-'"))?;
let start: u64 = range
diff --git a/scripts/helper_scripts/unipept-database-rs/src/bin/xml-parser.rs b/scripts/helper_scripts/unipept-database-rs/src/bin/xml-parser.rs
index 9e102cb6..b10ae097 100644
--- a/scripts/helper_scripts/unipept-database-rs/src/bin/xml-parser.rs
+++ b/scripts/helper_scripts/unipept-database-rs/src/bin/xml-parser.rs
@@ -4,7 +4,6 @@ use std::num::NonZeroUsize;
use anyhow::{Context, Result};
use clap::Parser;
use smartstring::{LazyCompact, SmartString};
-use unipept_database::uniprot::UniprotType;
use uniprot::uniprot::{SequentialParser, ThreadedParser};
use unipept_database::utils::files::open_sin;
type SmartStr = SmartString<LazyCompact>;
// Parse a Uniprot XML file and convert it into a TSV-file
#[derive(Parser, Debug)]
struct Cli {
- #[clap(value_enum, short = 't', long, default_value_t = UniprotType::Swissprot)]
- uniprot_type: UniprotType,
+ #[clap(short = 't', long, default_value = "swissprot")]
+ uniprot_type: String,
#[clap(long, default_value_t = 0)]
threads: u32,
#[clap(short, long, default_value_t = false)]
@@ -123,7 +122,7 @@ fn parse_name(entry: &uniprot::uniprot::Entry) -> SmartStr {
}
/// Write a single UniProt entry to stdout
-fn write_entry(entry: &uniprot::uniprot::Entry, db_type: &UniprotType, verbose: bool) {
+fn write_entry(entry: &uniprot::uniprot::Entry, db_type: &str, verbose: bool) {
let accession_number: SmartStr = entry.accessions[0].clone();
let sequence: SmartStr = entry.sequence.value.clone();
@@ -165,7 +164,7 @@ fn write_entry(entry: &uniprot::uniprot::Entry, db_type: &UniprotType, verbose:
SmartStr::from(ec_references.join(";")),
SmartStr::from(go_references.join(";")),
SmartStr::from(ip_references.join(";")),
- SmartStr::from(db_type.to_str()),
+ SmartStr::from(db_type),
taxon_id,
];
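
With the UniprotType value enum gone, clap no longer rejects unknown values for -t in either parser; the string is written to the output column as-is. If that validation is ever wanted back, a small guard along these lines would suffice (a sketch, not part of this patch; it reuses the anyhow error type the binaries already depend on):

    /// Reject database types other than the two values the old enum allowed.
    fn check_db_type(db_type: &str) -> anyhow::Result<()> {
        if db_type == "swissprot" || db_type == "trembl" {
            Ok(())
        } else {
            anyhow::bail!("unknown database type: {db_type}")
        }
    }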
diff --git a/scripts/helper_scripts/unipept-database-rs/src/dat_parser/entry.rs b/scripts/helper_scripts/unipept-database-rs/src/dat_parser/entry.rs
index 4d22fd78..0c2745a6 100644
--- a/scripts/helper_scripts/unipept-database-rs/src/dat_parser/entry.rs
+++ b/scripts/helper_scripts/unipept-database-rs/src/dat_parser/entry.rs
@@ -1,8 +1,5 @@
-use std::collections::HashSet;
-
use anyhow::Context;
-
-use crate::uniprot::UniprotType;
+use std::collections::HashSet;
// Constants to aid in parsing
const COMMON_PREFIX_LEN: usize = "ID ".len();
@@ -49,7 +46,7 @@ impl UniProtDATEntry {
}
/// Write an entry to stdout
- pub fn write(&self, db_type: &UniprotType) {
+ pub fn write(&self, db_type: &str) {
if self.name.is_empty() {
eprintln!(
"Could not find a name for entry AC-{}",
@@ -66,7 +63,7 @@ impl UniProtDATEntry {
self.ec_references.join(";"),
self.go_references.join(";"),
self.ip_references.join(";"),
- db_type.to_str(),
+ db_type,
self.taxon_id
)
}
diff --git a/scripts/helper_scripts/unipept-database-rs/src/lib.rs b/scripts/helper_scripts/unipept-database-rs/src/lib.rs
index 497dd646..9b309f52 100644
--- a/scripts/helper_scripts/unipept-database-rs/src/lib.rs
+++ b/scripts/helper_scripts/unipept-database-rs/src/lib.rs
@@ -2,5 +2,4 @@ pub mod calculate_lcas;
pub mod dat_parser;
pub mod taxons_lineages;
pub mod taxons_uniprots_tables;
-pub mod uniprot;
pub mod utils;
diff --git a/scripts/helper_scripts/unipept-database-rs/src/taxons_uniprots_tables/table_writer.rs b/scripts/helper_scripts/unipept-database-rs/src/taxons_uniprots_tables/table_writer.rs
index ec9dd1ab..d92b9a15 100644
--- a/scripts/helper_scripts/unipept-database-rs/src/taxons_uniprots_tables/table_writer.rs
+++ b/scripts/helper_scripts/unipept-database-rs/src/taxons_uniprots_tables/table_writer.rs
@@ -112,6 +112,7 @@ impl TableWriter {
id,
sequence,
&summary,
+ entry.taxon_id,
)
.context("Failed to write peptide")?;
}
@@ -125,17 +126,19 @@ impl TableWriter {
id: i64,
original_sequence: &[u8],
annotations: &String,
+ taxon_id: i32,
) -> Result<()> {
self.peptide_count += 1;
writeln!(
&mut self.peptides,
- "{}\t{}\t{}\t{}\t{}",
+ "{}\t{}\t{}\t{}\t{}\t{}",
self.peptide_count,
String::from_utf8_lossy(&sequence),
String::from_utf8_lossy(original_sequence),
id,
- annotations
+ annotations,
+ taxon_id
)
.context("Error writing to TSV")?;
diff --git a/scripts/helper_scripts/unipept-database-rs/src/uniprot/mod.rs b/scripts/helper_scripts/unipept-database-rs/src/uniprot/mod.rs
deleted file mode 100644
index ae293ece..00000000
--- a/scripts/helper_scripts/unipept-database-rs/src/uniprot/mod.rs
+++ /dev/null
@@ -1,15 +0,0 @@
-/// Enum for the different kinds of databases
-#[derive(clap::ValueEnum, Clone, Debug)]
-pub enum UniprotType {
- Swissprot,
- Trembl,
-}
-
-impl UniprotType {
- pub fn to_str(&self) -> &str {
- match self {
- UniprotType::Swissprot => "swissprot",
- UniprotType::Trembl => "trembl",
- }
- }
-}
diff --git a/scripts/parallel_load.sh b/scripts/parallel_load.sh
index 88224212..fcbfc571 100755
--- a/scripts/parallel_load.sh
+++ b/scripts/parallel_load.sh
@@ -1,5 +1,4 @@
shopt -s expand_aliases
-alias zcat="pigz -cd"
export db=unipept
export user=root
@@ -9,16 +8,16 @@ dir="$1"
function load_table() {
file=$1
- tbl=`echo $file | sed "s/.tsv.gz//"`
- echo "zcatting - LOAD DATA LOCAL INFILE '$file' INTO TABLE $tbl"
- zcat $file | mariadb --local-infile=1 -u$user -p$pass $db -e "LOAD DATA LOCAL INFILE '/dev/stdin' INTO TABLE $tbl;SHOW WARNINGS" 2>&1
+ tbl=`echo $file | sed "s/.tsv.lz4//"`
+ echo "lz4catting - LOAD DATA LOCAL INFILE '$file' INTO TABLE $tbl"
+ lz4 -dc $file | mysql --local-infile=1 -u$user -p$pass $db -e "LOAD DATA LOCAL INFILE '/dev/stdin' INTO TABLE $tbl;SHOW WARNINGS" 2>&1
}
export -f load_table
cd "$dir"
-parallel load_table ::: *.tsv.gz
+parallel load_table ::: *.tsv.lz4
cd "-"