Merge pull request #30 from unipept/fix/empty-fa

Provide fix for empty functional annotations in database
unipept · Sep 27, 2023 · 01bdeed
2 parents 9c082ce + e5cf281
commit 01bdeed
Show file tree

Hide file tree

Showing 3 changed files with 15 additions and 43 deletions.
diff --git a/scripts/helper_scripts/TaxonsUniprots2Tables.jar b/scripts/helper_scripts/TaxonsUniprots2Tables.jar
diff --git a/scripts/helper_scripts/parser/src/storage/TableWriter.java b/scripts/helper_scripts/parser/src/storage/TableWriter.java
@@ -41,8 +41,6 @@ public class TableWriter implements UniprotObserver {
     // csv files
     private CSV.IndexedWriter peptides;
     private CSV.IndexedWriter uniprotEntries;
-    private CSV.IndexedWriter refseqCrossReferences;
-    private CSV.IndexedWriter emblCrossReferences;
     private CSV.IndexedWriter goCrossReferences;
     private CSV.IndexedWriter ecCrossReferences;
     private CSV.IndexedWriter interProCrossReferences;
@@ -80,19 +78,15 @@ public void store(UniprotEntry entry) {
         long uniprotEntryId = addUniprotEntry(entry.getUniprotAccessionNumber(), entry.getVersion(),
                 entry.getTaxonId(), entry.getType(), entry.getName(), entry.getSequence());
         if (uniprotEntryId != -1) { // failed to add entry
-
-            // todo make cleaner
             String faSummary = Stream.of(
                     entry.getGOReferences().stream().map(UniprotGORef::getId),
-                    entry.getECReferences().stream().map(x->"EC:"+x.getId()),
-                    entry.getInterProReferences().stream().map(x->"IPR:"+x.getId())
+                    entry.getECReferences().stream().filter(x -> !x.getId().isEmpty()).map(x->"EC:"+x.getId()),
+                    entry.getInterProReferences().stream().filter(x -> !x.getId().isEmpty()).map(x->"IPR:"+x.getId())
             ).flatMap(i -> i).collect(Collectors.joining(";"));
 
             for(String sequence : entry.digest()) {
                 addData(sequence.replace('I', 'L'), uniprotEntryId, sequence, faSummary);
             }
-            for (UniprotDbRef ref : entry.getDbReferences())
-                addDbRef(ref, uniprotEntryId);
             for (UniprotGORef ref : entry.getGOReferences())
                 addGORef(ref, uniprotEntryId);
             for (UniprotECRef ref : entry.getECReferences())
@@ -124,12 +118,13 @@ public long addUniprotEntry(String uniprotAccessionNumber, int version, int taxo
         if(0 <= taxonId && taxonId < taxonList.size() && taxonList.get(taxonId) != null) {
             try {
                 uniprotEntries.write(
-                        uniprotAccessionNumber,
-                        Integer.toString(version),
-                        Integer.toString(taxonId),
-                        type,
-                        name,
-                        sequence);
+                    uniprotAccessionNumber,
+                    Integer.toString(version),
+                    Integer.toString(taxonId),
+                    type,
+                    name,
+                    sequence
+                );
                 return uniprotEntries.index();
             } catch(IOException e) {
                 System.err.println(new Timestamp(System.currentTimeMillis())
@@ -163,41 +158,18 @@ public long addUniprotEntry(String uniprotAccessionNumber, int version, int taxo
     public void addData(String unifiedSequence, long uniprotEntryId, String originalSequence, String functionalAnnotations) {
         try {
             peptides.write(
-                    unifiedSequence,
-                    originalSequence,
-                    Long.toString(uniprotEntryId),
-                    functionalAnnotations
-                    );
+                unifiedSequence,
+                originalSequence,
+                Long.toString(uniprotEntryId),
+                functionalAnnotations
+            );
         } catch(IOException e) {
             System.err.println(new Timestamp(System.currentTimeMillis())
                     + " Error adding this peptide to the database: " + unifiedSequence);
             e.printStackTrace();
         }
     }
 
-    /**
-     * Adds a uniprot entry cross reference to the database
-     *
-     * @param ref
-     *            The uniprot cross reference to add
-     * @param uniprotEntryId
-     *            The uniprotEntry of the cross reference
-     */
-    public void addDbRef(UniprotDbRef ref, long uniprotEntryId) {
-        try {
-            CSV.Writer w = (ref.getType().equals("EMBL"))
-                ? emblCrossReferences
-                : refseqCrossReferences;
-            w.write(Long.toString(uniprotEntryId),
-                    ref.getProteinId(),
-                    ref.getSequenceId());
-        } catch (IOException e) {
-            System.err.println(new Timestamp(System.currentTimeMillis())
-                    + " Error adding this cross reference to the database.");
-            e.printStackTrace();
-        }
-    }
-
     /**
      * Adds a uniprot entry GO reference to the database
      *

diff --git a/scripts/helper_scripts/parser/src/tools/NamesNodes2TaxonsLineages.java b/scripts/helper_scripts/parser/src/tools/NamesNodes2TaxonsLineages.java
@@ -18,7 +18,7 @@ public class NamesNodes2TaxonsLineages {
      * Parse a list of taxons and their lineages from the NCBI dumps.
      *
      * This program will parse the first two argument files, and create the next
-     * two. The first two arguments are the nodes.dmp en names.dmp files
+     * two. The first two arguments are the nodes.dmp and names.dmp files
      * downloaded from the NCBI. TSV-dumps of the parsed taxons and lineages
      * will be written to the third and fourth parameter.
      */