diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile
index bd26eb0..af3ca42 100644
--- a/.devcontainer/Dockerfile
+++ b/.devcontainer/Dockerfile
@@ -3,7 +3,7 @@
 FROM mysql:5.7
 LABEL maintainer="Pieter Verschaffelt "
 RUN mkdir -p /usr/share/man/man1
-RUN apt-get update && apt-get install -y default-jdk git dos2unix maven make wget unzip expect gawk binutils gcc libssl-dev
+RUN apt-get update && apt-get install -y default-jdk git dos2unix maven make wget unzip expect gawk binutils gcc libssl-dev uuid-runtime pv nodejs
 
 # Now, build curl from source to upgrade it to version 7.68.0 (instead of the default 7.52.0 in this image)
 # This fixes https://github.com/curl/curl/issues/3750 which is present in older versions of curl
diff --git a/.devcontainer/data/.gitkeep b/.devcontainer/data/.gitkeep
deleted file mode 100644
index e69de29..0000000
diff --git a/.devcontainer/scripts/create_database.sh b/.devcontainer/scripts/create_database.sh
index 9c390c6..715892a 100644
--- a/.devcontainer/scripts/create_database.sh
+++ b/.devcontainer/scripts/create_database.sh
@@ -19,13 +19,7 @@ if test -f "/tables/uniprot_entries.tsv.gz"; then
 else
     echo "Data needs to be regenerated."
     # The generation of umgap-data is not supported at this time by this Docker container.
-    sed -i '/checkdep umgap/d' configure
-    sed -i '/all: makefile database index/d' makefile.in
-    cp /scripts/script.exp script.exp
-    chmod +x script.exp
-    chmod +x configure
-    ./script.exp
-    make
+    ./run.sh database
 
     # Move the data that has been generated to the tables volume. If this file is rerun afterwards, the data does not
     # need to be regenerated.
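
Note: nodejs is added to the image above because run.sh now delegates the functional-analysis step to the new js_tools/FunctionalAnalysisPeptides.js tool (introduced below); pv and uuid-runtime are assumed here to be further runtime dependencies of run.sh. A quick sanity check inside the container:

    node --version   # prints the installed version if the image was built correctly
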
diff --git a/js_tools/FunctionalAnalysisPeptides.js b/js_tools/FunctionalAnalysisPeptides.js
new file mode 100644
index 0000000..2e63546
--- /dev/null
+++ b/js_tools/FunctionalAnalysisPeptides.js
@@ -0,0 +1,78 @@
+// Creates a list of (peptide, functional annotation JSON)-pairs separated by a tab.
+// Assumes the input file to be sorted on its first column (the peptide).
+const readline = require('readline');
+const fs = require('fs');
+const start = new Date().getTime();
+const args = process.argv;
+if (args.length !== 4) {
+    console.log("Please provide 2 parameters: input and output.");
+    process.exit(1);
+}
+const inputFile = args[2];
+const outputFile = args[3];
+const readInterface = readline.createInterface({
+    input: fs.createReadStream(inputFile)
+});
+const writer = fs.createWriteStream(outputFile);
+let row = null;
+let curPept = null;
+let numProt = 0;
+let numAnnotatedGO = 0;
+let numAnnotatedEC = 0;
+let numAnnotatedInterPro = 0;
+let done = 0;
+let m = new Map();
+// Aggregate term counts per peptide; emit a summary line whenever the peptide changes.
+readInterface.on('line', function (line) {
+    row = line.split("\t");
+    if (row[0] !== curPept) {
+        if (curPept !== null) {
+            if (m.size !== 0) {
+                writer.write(`${curPept}\t{"num":{"all":${numProt},"EC":${numAnnotatedEC},"GO":${numAnnotatedGO},"IPR":${numAnnotatedInterPro}},"data":{${Array.from(m.entries(), ([k, v]) => `"${k}":${v}`).join(",")}}}\n`);
+            }
+        }
+        m.clear();
+        numProt = 0;
+        numAnnotatedGO = 0;
+        numAnnotatedEC = 0;
+        numAnnotatedInterPro = 0;
+        curPept = row[0];
+    }
+    numProt++;
+    if (row.length > 1) {
+        const terms = row[1].split(";");
+        let hasEC = false;
+        let hasGO = false;
+        let hasInterPro = false;
+        // Terms look like "GO:...", "EC:..." or "IPR:...", so the first character identifies the type.
+        for (const term of terms) {
+            if (!term) {
+                continue;
+            }
+            if (term.startsWith("G")) {
+                hasGO = true;
+            } else if (term.startsWith("E")) {
+                hasEC = true;
+            } else {
+                hasInterPro = true;
+            }
+            m.set(term, (m.get(term) || 0) + 1);
+        }
+        numAnnotatedGO += hasGO ? 1 : 0;
+        numAnnotatedEC += hasEC ? 1 : 0;
+        numAnnotatedInterPro += hasInterPro ? 1 : 0;
+    }
+    done++;
+    if (done % 1000000 === 0) {
+        console.log("FA " + done + " rows");
+    }
+});
+// Emit the summary for the last peptide in the file.
+readInterface.on('close', function () {
+    if (m.size !== 0) {
+        writer.write(`${curPept}\t{"num":{"all":${numProt},"EC":${numAnnotatedEC},"GO":${numAnnotatedGO},"IPR":${numAnnotatedInterPro}},"data":{${Array.from(m.entries(), ([k, v]) => `"${k}":${v}`).join(",")}}}\n`);
+    }
+    writer.end();
+    const end = new Date().getTime();
+    console.log("Took " + (end - start) / 1000 + "s");
+});
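
A minimal sketch of how the new tool is driven, with illustrative file names (in the real pipeline it reads from a named pipe, as the run.sh changes below show):

    # two sorted input rows for the same peptide
    printf 'AAK\tGO:0016569;EC:2.7.11.1\nAAK\tGO:0016569\n' > peptides.tsv
    node js_tools/FunctionalAnalysisPeptides.js peptides.tsv fas.tsv
    # fas.tsv now holds one summary line:
    # AAK\t{"num":{"all":2,"EC":1,"GO":2,"IPR":0},"data":{"GO:0016569":2,"EC:2.7.11.1":1}}
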
diff --git a/make-on-hpc.sh b/make-on-hpc.sh
index 0be82fd..c1df1f6 100644
--- a/make-on-hpc.sh
+++ b/make-on-hpc.sh
@@ -12,6 +12,7 @@
 # Loading the required modules
 module load Java
 module load Maven
+module load nodejs
 
 # Check the current directory
 pushd "$PBS_O_WORKDIR"
diff --git a/run.sh b/run.sh
index e036e9c..1408630 100755
--- a/run.sh
+++ b/run.sh
@@ -16,7 +16,6 @@
 JAVA_MEM="6g" # How much memory should Java use?
 ENTREZ_BATCH_SIZE=1000 # Which batch size should I use for communication with Entrez?
 CMD_SORT="sort --buffer-size=80% --parallel=4" # Which sort command should I use?
 CMD_GZIP="gzip -" # Which pipe compression command should I use?
-
 SOURCES='swissprot https://ftp.expasy.org/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.xml.gz'
 #trembl https://ftp.expasy.org/databases/uniprot/current_release/knowledgebase/complete/uniprot_trembl.xml.gz'
@@ -167,13 +166,13 @@ create_most_tables() {
 join_equalized_pepts_and_entries() {
     have "$INTDIR/peptides.tsv.gz" "$TABDIR/uniprot_entries.tsv.gz" || return
     log "Started the joining of equalized peptides and uniprot entries."
-    mkfifo "$TMP/peptides" "$TMP/entries"
-    zcat "$INTDIR/peptides.tsv.gz" | awk '{ printf("%012d\t%s\n", $4, $2) }' > "$TMP/peptides" &
-    zcat "$TABDIR/uniprot_entries.tsv.gz" | awk '{ printf("%012d\t%s\n", $1, $4) }' > "$TMP/entries" &
-    join -t ' ' -o '1.2,2.2' -j 1 "$TMP/peptides" "$TMP/entries" \
+    mkfifo "$TMP/peptides_eq" "$TMP/entries_eq"
+    zcat "$INTDIR/peptides.tsv.gz" | awk '{ printf("%012d\t%s\n", $4, $2) }' > "$TMP/peptides_eq" &
+    zcat "$TABDIR/uniprot_entries.tsv.gz" | awk '{ printf("%012d\t%s\n", $1, $4) }' > "$TMP/entries_eq" &
+    join -t ' ' -o '1.2,2.2' -j 1 "$TMP/peptides_eq" "$TMP/entries_eq" \
         | LC_ALL=C $CMD_SORT -k1 \
         | $CMD_GZIP - > "$INTDIR/aa_sequence_taxon_equalized.tsv.gz"
-    rm "$TMP/peptides" "$TMP/entries"
+    rm "$TMP/peptides_eq" "$TMP/entries_eq"
     log "Finished the joining of equalized peptides and uniprot entries."
 }
 
@@ -181,13 +180,13 @@ join_equalized_pepts_and_entries() {
 join_original_pepts_and_entries() {
     have "$INTDIR/peptides.tsv.gz" "$TABDIR/uniprot_entries.tsv.gz" || return
     log "Started the joining of original peptides and uniprot entries."
-    mkfifo "$TMP/peptides" "$TMP/entries"
-    zcat "$INTDIR/peptides.tsv.gz" | awk '{ printf("%012d\t%s\n", $4, $3) }' > "$TMP/peptides" &
-    zcat "$TABDIR/uniprot_entries.tsv.gz" | awk '{ printf("%012d\t%s\n", $1, $4) }' > "$TMP/entries" &
-    join -t ' ' -o '1.2,2.2' -j 1 "$TMP/peptides" "$TMP/entries" \
+    mkfifo "$TMP/peptides_orig" "$TMP/entries_orig"
+    zcat "$INTDIR/peptides.tsv.gz" | awk '{ printf("%012d\t%s\n", $4, $3) }' > "$TMP/peptides_orig" &
+    zcat "$TABDIR/uniprot_entries.tsv.gz" | awk '{ printf("%012d\t%s\n", $1, $4) }' > "$TMP/entries_orig" &
+    join -t ' ' -o '1.2,2.2' -j 1 "$TMP/peptides_orig" "$TMP/entries_orig" \
        | LC_ALL=C $CMD_SORT -k1 \
        | $CMD_GZIP - > "$INTDIR/aa_sequence_taxon_original.tsv.gz"
-    rm "$TMP/peptides" "$TMP/entries"
+    rm "$TMP/peptides_orig" "$TMP/entries_orig"
     log "Finished the joining of original peptides and uniprot entries."
 }
 
@@ -243,10 +242,10 @@ substitute_equalized_aas() {
 calculate_equalized_fas() {
     have "$INTDIR/peptides_by_equalized.tsv.gz" || return
     log "Started the calculation of equalized FA's."
-    mkfifo "$TMP/peptides"
-    zcat "$INTDIR/peptides_by_equalized.tsv.gz" | cut -f2,5 > "$TMP/peptides" &
-    java_ FunctionAnalysisPeptides "$TMP/peptides" "$(gz "$INTDIR/FAs_equalized.tsv.gz")"
-    rm "$TMP/peptides"
+    mkfifo "$TMP/peptides_eq"
+    zcat "$INTDIR/peptides_by_equalized.tsv.gz" | cut -f2,5 > "$TMP/peptides_eq" &
+    node js_tools/FunctionalAnalysisPeptides.js "$TMP/peptides_eq" "$(gz "$INTDIR/FAs_equalized.tsv.gz")"
+    rm "$TMP/peptides_eq"
     log "Finished the calculation of equalized FA's."
 }
 
@@ -265,10 +264,10 @@ substitute_original_aas() {
 calculate_original_fas() {
     have "$INTDIR/peptides_by_original.tsv.gz" || return
     log "Started the calculation of original FA's."
-    mkfifo "$TMP/peptides"
-    zcat "$INTDIR/peptides_by_original.tsv.gz" | cut -f3,5 > "$TMP/peptides" &
-    java_ FunctionAnalysisPeptides "$TMP/peptides" "$(gz "$INTDIR/FAs_original.tsv.gz")"
-    rm "$TMP/peptides"
+    mkfifo "$TMP/peptides_orig"
+    zcat "$INTDIR/peptides_by_original.tsv.gz" | cut -f3,5 > "$TMP/peptides_orig" &
+    node js_tools/FunctionalAnalysisPeptides.js "$TMP/peptides_orig" "$(gz "$INTDIR/FAs_original.tsv.gz")"
+    rm "$TMP/peptides_orig"
     log "Finished the calculation of original FA's."
 }
 
@@ -472,19 +471,31 @@ database)
     create_taxon_tables
     download_sources
     create_most_tables
-    join_equalized_pepts_and_entries
-    join_original_pepts_and_entries
+    join_equalized_pepts_and_entries &
+    pid1=$!
+    join_original_pepts_and_entries &
+    pid2=$!
+    wait $pid1
+    wait $pid2
     number_sequences
-    calculate_equalized_lcas
+    calculate_equalized_lcas &
+    pid1=$!
+    calculate_original_lcas &
+    pid2=$!
+    wait $pid1
+    wait $pid2
     rm "$INTDIR/aa_sequence_taxon_equalized.tsv.gz"
-    calculate_original_lcas
     rm "$INTDIR/aa_sequence_taxon_original.tsv.gz"
     substitute_equalized_aas
     rm "$INTDIR/peptides.tsv.gz"
-    calculate_equalized_fas
     substitute_original_aas
+    calculate_equalized_fas &
+    pid1=$!
+    calculate_original_fas &
+    pid2=$!
+    wait $pid1
+    wait $pid2
     rm "$INTDIR/peptides_by_equalized.tsv.gz"
-    calculate_original_fas
     sort_peptides
     rm "$INTDIR/peptides_by_original.tsv.gz"
     create_sequence_table
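
The database target above applies the same fork-and-join pattern three times to run independent pipeline steps concurrently; in isolation it looks like this (step names are placeholders):

    step_a & pid1=$!
    step_b & pid2=$!
    wait $pid1    # waiting on an explicit PID also surfaces that job's exit status
    wait $pid2
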
diff --git a/src/main/java/org/unipept/tools/FunctionAnalysisPeptides.java b/src/main/java/org/unipept/tools/FunctionAnalysisPeptides.java
deleted file mode 100644
index 579e353..0000000
--- a/src/main/java/org/unipept/tools/FunctionAnalysisPeptides.java
+++ /dev/null
@@ -1,125 +0,0 @@
-package org.unipept.tools;
-
-import java.io.IOException;
-import java.util.HashMap;
-import java.util.Map;
-import java.util.stream.Collectors;
-import org.unipept.storage.CSV;
-
-/**
- * Creates a list of (peptide, functional JSON)-pairs separated by a \t.
- *
- * Assumes the file to be sorted on the first col (peptide).
- */
-public class FunctionAnalysisPeptides {
-
-    public static void main(String[] args) throws IOException {
-        if (args.length != 2) {
-            throw new RuntimeException("Please provide 2 parameters. (input,output)");
-        }
-
-        String inputPepts = args[0];
-        String peptidesList = args[1];
-
-        CSV.Reader reader = new CSV.Reader(inputPepts);
-        CSV.Writer writer = new CSV.Writer(peptidesList);
-
-        Map<String, Integer> m = new HashMap<>();
-
-        String[] row = null;
-        String curPept = null;
-        int numProt = 0;
-        int numAnnotatedGO = 0;
-        int numAnnotatedEC = 0;
-        int numAnnotatedInterPro = 0;
-        long done = 0;
-        while ((row = reader.read()) != null) {
-
-            if (!row[0].equals(curPept)) {
-                if (curPept != null) {
-                    if (!m.isEmpty()) {
-                        writer.write(curPept, extracted(m, numProt, numAnnotatedGO, numAnnotatedEC, numAnnotatedInterPro));
-                    }
-                }
-                m.clear();
-                numProt = 0;
-                numAnnotatedGO = 0;
-                numAnnotatedEC = 0;
-                numAnnotatedInterPro = 0;
-                curPept = row[0];
-            }
-            numProt++;
-
-            if (row.length > 1) {
-                String[] terms = row[1].split(";");
-
-                boolean hasEC = false;
-                boolean hasGO = false;
-                boolean hasInterPro = false;
-
-                for (String term : terms) {
-                    hasGO |= term.startsWith("GO");
-                    hasEC |= term.startsWith("EC");
-                    hasInterPro |= term.startsWith("IPR");
-                    m.put(term, m.getOrDefault(term, 0) + 1);
-                }
-                numAnnotatedGO += hasGO ? 1 : 0;
-                numAnnotatedEC += hasEC ? 1 : 0;
-                numAnnotatedInterPro += hasInterPro ? 1 : 0;
-            }
-            done++;
-            if (done % 1000000 == 0) {
-                System.err.println("FA " + done + " rows");
-            }
-        }
-        if (!m.isEmpty()) {
-            writer.write(curPept, extracted(m, numProt, numAnnotatedGO, numAnnotatedEC, numAnnotatedInterPro));
-        }
-        writer.close();
-    }
-
-    /**
-     *
-     * Output of the following form:
-     * {
-     *   "num": {
-     *     "all": 1,
-     *     "EC": 1,
-     *     "GO": 1,
-     *     "IPR": 1
-     *   },
-     *   "data": {
-     *     "GO:0016569": 1,
-     *     "GO:0006281": 1,
-     *     "GO:0000781": 1,
-     *     "EC:2.7.11.1": 1,
-     *     "GO:0004674": 1,
-     *     "GO:0005634": 1,
-     *     "GO:0005524": 1,
-     *     "GO:0016301": 1,
-     *     "IPR:IPR000001": 1
-     *   }
-     * }
-     *
-     * But without spacing:
-     * {"num":{"all":1,"EC":1,"GO":1,"IPR":1},"data":{"GO:0016569":1,"GO:0006281":1,"GO:0000781":1,"2.7.11.1":1,"GO:0004674":1,"GO:0005634":1,"GO:0005524":1,"GO:0016301":1,"IPR:IPR000001":1}}
-     */
-    private static String extracted(Map<String, Integer> m, int numProt, int numAnnotatedGO, int numAnnotatedEC, int numAnnotatedInterPro) {
-        StringBuilder sb = new StringBuilder();
-        sb.append("{\"num\":{\"all\":");
-        sb.append(numProt);
-        sb.append(",\"EC\":");
-        sb.append(numAnnotatedEC);
-        sb.append(",\"GO\":");
-        sb.append(numAnnotatedGO);
-        sb.append(",\"IPR\":");
-        sb.append(numAnnotatedInterPro);
-        sb.append("},\"data\":{");
-        sb.append(m.entrySet().stream().map(e -> {
-            return '"' + e.getKey() + "\":" + e.getValue();
-        }).collect(Collectors.joining(",")));
-        sb.append("}}");
-        return sb.toString();
-    }
-
-}