Skip to content

Commit

Permalink
Merge pull request #30 from unipept/fix/empty-fa
Browse files Browse the repository at this point in the history
Provide fix for empty functional annotations in database
  • Loading branch information
pverscha authored Sep 27, 2023
2 parents 9c082ce + e5cf281 commit 01bdeed
Show file tree
Hide file tree
Showing 3 changed files with 15 additions and 43 deletions.
Binary file modified scripts/helper_scripts/TaxonsUniprots2Tables.jar
Binary file not shown.
56 changes: 14 additions & 42 deletions scripts/helper_scripts/parser/src/storage/TableWriter.java
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,6 @@ public class TableWriter implements UniprotObserver {
// csv files
private CSV.IndexedWriter peptides;
private CSV.IndexedWriter uniprotEntries;
private CSV.IndexedWriter refseqCrossReferences;
private CSV.IndexedWriter emblCrossReferences;
private CSV.IndexedWriter goCrossReferences;
private CSV.IndexedWriter ecCrossReferences;
private CSV.IndexedWriter interProCrossReferences;
Expand Down Expand Up @@ -80,19 +78,15 @@ public void store(UniprotEntry entry) {
long uniprotEntryId = addUniprotEntry(entry.getUniprotAccessionNumber(), entry.getVersion(),
entry.getTaxonId(), entry.getType(), entry.getName(), entry.getSequence());
if (uniprotEntryId != -1) { // failed to add entry

// todo make cleaner
String faSummary = Stream.of(
entry.getGOReferences().stream().map(UniprotGORef::getId),
entry.getECReferences().stream().map(x->"EC:"+x.getId()),
entry.getInterProReferences().stream().map(x->"IPR:"+x.getId())
entry.getECReferences().stream().filter(x -> !x.getId().isEmpty()).map(x->"EC:"+x.getId()),
entry.getInterProReferences().stream().filter(x -> !x.getId().isEmpty()).map(x->"IPR:"+x.getId())
).flatMap(i -> i).collect(Collectors.joining(";"));

for(String sequence : entry.digest()) {
addData(sequence.replace('I', 'L'), uniprotEntryId, sequence, faSummary);
}
for (UniprotDbRef ref : entry.getDbReferences())
addDbRef(ref, uniprotEntryId);
for (UniprotGORef ref : entry.getGOReferences())
addGORef(ref, uniprotEntryId);
for (UniprotECRef ref : entry.getECReferences())
Expand Down Expand Up @@ -124,12 +118,13 @@ public long addUniprotEntry(String uniprotAccessionNumber, int version, int taxo
if(0 <= taxonId && taxonId < taxonList.size() && taxonList.get(taxonId) != null) {
try {
uniprotEntries.write(
uniprotAccessionNumber,
Integer.toString(version),
Integer.toString(taxonId),
type,
name,
sequence);
uniprotAccessionNumber,
Integer.toString(version),
Integer.toString(taxonId),
type,
name,
sequence
);
return uniprotEntries.index();
} catch(IOException e) {
System.err.println(new Timestamp(System.currentTimeMillis())
Expand Down Expand Up @@ -163,41 +158,18 @@ public long addUniprotEntry(String uniprotAccessionNumber, int version, int taxo
public void addData(String unifiedSequence, long uniprotEntryId, String originalSequence, String functionalAnnotations) {
// Writes one peptide row through the `peptides` CSV writer. The row holds, in order:
// the unified sequence, the original sequence, the id of the owning uniprot entry
// (converted to its decimal string form) and the functional annotation summary.
// The caller visible in store() passes the digested sequence with every 'I' replaced
// by 'L' as the unified sequence, and the untouched digest as the original sequence.
try {
// NOTE(review): the four arguments below appear twice in this view, which looks like
// the before/after lines of a whitespace-only diff hunk -- confirm against the real
// file that `write` receives each value only once.
peptides.write(
unifiedSequence,
originalSequence,
Long.toString(uniprotEntryId),
functionalAnnotations
);
unifiedSequence,
originalSequence,
Long.toString(uniprotEntryId),
functionalAnnotations
);
} catch(IOException e) {
// A failed write is reported on stderr (timestamp, message and the unified sequence)
// together with the stack trace; the exception is not rethrown, so the method
// returns normally and the caller is not informed of the failure.
System.err.println(new Timestamp(System.currentTimeMillis())
+ " Error adding this peptide to the database: " + unifiedSequence);
e.printStackTrace();
}
}

/**
 * Writes a single database cross reference of a uniprot entry to the
 * matching cross reference CSV writer.
 *
 * References whose type equals {@code "EMBL"} go to the EMBL writer; every
 * other reference goes to the RefSeq writer. The written row consists of
 * the uniprot entry id, the protein id and the sequence id, in that order.
 * An {@link IOException} raised while writing is reported on stderr
 * (timestamped message plus stack trace) and is not rethrown.
 *
 * @param ref
 *            The uniprot cross reference to write
 * @param uniprotEntryId
 *            The id of the uniprot entry the cross reference belongs to
 */
public void addDbRef(UniprotDbRef ref, long uniprotEntryId) {
    try {
        // Pick the destination file based on the reference type.
        final CSV.Writer target;
        if (ref.getType().equals("EMBL")) {
            target = emblCrossReferences;
        } else {
            target = refseqCrossReferences;
        }

        final String entryId = Long.toString(uniprotEntryId);
        target.write(entryId, ref.getProteinId(), ref.getSequenceId());
    } catch (IOException e) {
        final Timestamp now = new Timestamp(System.currentTimeMillis());
        System.err.println(now + " Error adding this cross reference to the database.");
        e.printStackTrace();
    }
}

/**
* Adds a uniprot entry GO reference to the database
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ public class NamesNodes2TaxonsLineages {
* Parse a list of taxons and their lineages from the NCBI dumps.
*
* This program will parse the first two argument files, and create the next
* two. The first two arguments are the nodes.dmp en names.dmp files
* two. The first two arguments are the nodes.dmp and names.dmp files
* downloaded from the NCBI. TSV-dumps of the parsed taxons and lineages
* will be written to the third and fourth parameter.
*/
Expand Down

0 comments on commit 01bdeed

Please sign in to comment.