diff --git a/src/main/scala/org/renci/chemotext/Main.scala b/src/main/scala/org/renci/chemotext/Main.scala index 86b0a4d..a1cac6e 100644 --- a/src/main/scala/org/renci/chemotext/Main.scala +++ b/src/main/scala/org/renci/chemotext/Main.scala @@ -120,6 +120,14 @@ object PubMedTripleGenerator extends LazyLogging { pmidIRI, RDF.`type`, ResourceFactory.createResource(s"$FaBiONamespace/Article") + ), + ( + // dct:isVersionOf + ResourceFactory.createStatement( + pmidIRI, + DCTerms.isVersionOf, + ResourceFactory.createResource(pubMedArticleWrapped.pmidIRI) + ) ) ) ++ ( // fabio:hasPublicationYear "2019"^xsd:gYear diff --git a/src/main/scala/org/renci/chemotext/PubMedArticleWrapper.scala b/src/main/scala/org/renci/chemotext/PubMedArticleWrapper.scala index 5028a78..d843702 100644 --- a/src/main/scala/org/renci/chemotext/PubMedArticleWrapper.scala +++ b/src/main/scala/org/renci/chemotext/PubMedArticleWrapper.scala @@ -71,6 +71,7 @@ object PubMedArticleWrapper { class PubMedArticleWrapper(val article: Node) { // The following methods extract particular fields from the wrapped PubMed article. val pmid: String = (article \ "MedlineCitation" \ "PMID").text + val version: String = (article \ "MedlineCitation" \ "PMID" \ "@Version").text val title: String = (article \\ "ArticleTitle").map(_.text).mkString(" ") val abstractText: String = (article \\ "AbstractText").map(_.text).mkString(" ") val journalNodes: NodeSeq = (article \\ "Article" \ "Journal") @@ -145,6 +146,10 @@ class PubMedArticleWrapper(val article: Node) { // Generate an IRI for this PubMedArticleWrapper. val iriAsString: String = { + val PMIDNamespace = "https://www.ncbi.nlm.nih.gov/pubmed" + s"$PMIDNamespace/$pmid.$version" + } + val pmidIRI: String = { val PMIDNamespace = "https://www.ncbi.nlm.nih.gov/pubmed" s"$PMIDNamespace/$pmid" } diff --git a/src/test/resources/pubmedXML/alternateVersions.xml b/src/test/resources/pubmedXML/alternateVersions.xml new file mode 100644 index 0000000..3cdf925 --- /dev/null +++ b/src/test/resources/pubmedXML/alternateVersions.xml @@ -0,0 +1,553 @@ + + + + + + 31431825 + + 2019 + 08 + 21 + +
+ + 2046-1402 + + 8 + + 2019 + + + F1000Research + F1000Res + + Introducing high school students to the Gene Ontology classification system. + + 241 + + 10.12688/f1000research.18061.1 + + We present an activity that introduces high school students to the Gene Ontology classification system which is widely used in genomics and systems biology studies to characterize large sets of genes based on functional and structural information. This is a valuable and standardized method used to identify genes that act in similar processes and pathways and also to gain insight into the overall architecture and distribution of genes and gene families associated with a particular tissue or disease. Through this exercise, students will learn how the classification system works by analyzing a list of genes using DAVID the Database for Annotation, Visualization and Integrated Discovery that incorporates the Gene Ontology system into its suite of analysis tools. This method of profiling genes is used by our high school student interns to categorize gene expression data related to behavioral neuroscience. Students will get a feel for working with genes and gene sets, gain vocabulary, obtain an understanding of how a database is structured and gain an awareness of the vast amount of information that is known about genes as well as the online analysis tools that are available. + + + + Dedhia + Mehek + M + + BioScience Project, Wakefield, MA, 01880, USA. + + + + Kohetuk + Kenneth + K + https://orcid.org/0000-0003-1503-5735 + + Saint Dominic Savio Catholic High School, Austin, TX, 78717, USA. + + + + Crusio + Wim E + WE + https://orcid.org/0000-0001-6638-202X + + Institut de Neurosciences Cognitives et Intégratives d'Aquitaine (UMR 5287), Pessac, France. + + + University of Bordeaux (UMR 5287), Pessac, France. + + + + Delprato + Anna + A + https://orcid.org/0000-0002-6134-2560 + + BioScience Project, Wakefield, MA, 01880, USA. + + + + eng + + + figshare + + 10.6084/m9.figshare.7649225.v1 + + + + + Journal Article + + + 2019 + 03 + 01 + +
+ + England + F1000Res + 101594320 + 2046-1402 + + IM + + gene ontology + genomics + high school students + + No competing interests were disclosed. +
+ + + + 2019 + 02 + 11 + + + 2019 + 8 + 22 + 6 + 0 + + + 2019 + 8 + 23 + 6 + 0 + + + 2019 + 8 + 23 + 6 + 0 + + + epublish + + 31431825 + 10.12688/f1000research.18061.1 + PMC6619382 + + +
+ + + 31431825 + + 2019 + 08 + 23 + +
+ + 2046-1402 + + 8 + + 2019 + + + F1000Research + F1000Res + + Introducing high school students to the Gene Ontology classification system. + + 241 + + 10.12688/f1000research.18061.2 + + We present a tutorial that introduces high school students to the Gene Ontology classification system which is widely used in genomics and systems biology studies to characterize large sets of genes based on functional and structural information. This classification system is a valuable and standardized method used to identify genes that act in similar processes and pathways and also provides insight into the overall architecture and distribution of genes and gene families associated with a particular tissue or disease. By means of this tutorial, students learn how the classification system works through analyzing a gene set using DAVID the Database for Annotation, Visualization and Integrated Discovery that incorporates the Gene Ontology system into its suite of analysis tools. This method of profiling genes is used by our high school student interns to categorize gene expression data related to behavioral neuroscience. Students will get a feel for working with genes and gene sets, acquire vocabulary, obtain an understanding of how a database is structured and gain an awareness of the vast amount of information that is known about genes as well as the online analysis tools to manage this information that is nowadays available. Based on survey responses, students intellectually benefit from learning about the Gene Ontology System and using the DAVID tools, they are better prepared for future database use and they also find it enjoyable. + + + + Dedhia + Mehek + M + + BioScience Project, Wakefield, MA, 01880, USA. + + + + Kohetuk + Kenneth + K + https://orcid.org/0000-0003-1503-5735 + + Saint Dominic Savio Catholic High School, Austin, TX, 78717, USA. + + + + Crusio + Wim E + WE + https://orcid.org/0000-0001-6638-202X + + Institut de Neurosciences Cognitives et Intégratives d'Aquitaine (UMR 5287), Pessac, France. + + + University of Bordeaux (UMR 5287), Pessac, France. + + + + Delprato + Anna + A + https://orcid.org/0000-0002-6134-2560 + + BioScience Project, Wakefield, MA, 01880, USA. + + + + eng + + + figshare + + 10.6084/m9.figshare.7649225.v1 + + + + + Journal Article + + + 2019 + 03 + 01 + +
+ + England + F1000Res + 101594320 + 2046-1402 + + IM + + gene ontology + genomics + high school students + + No competing interests were disclosed. +
+ + + + 2019 + 04 + 08 + + + 2019 + 8 + 24 + 6 + 0 + + + 2019 + 8 + 24 + 6 + 0 + + + 2019 + 8 + 24 + 6 + 0 + + + epublish + + 31431825 + 10.12688/f1000research.18061.2 + PMC6619382.2 + + +
+ + + 31431825 + + 2019 + 08 + 23 + +
+ + 2046-1402 + + 8 + + 2019 + + + F1000Research + F1000Res + + Introducing high school students to the Gene Ontology classification system. + + 241 + + 10.12688/f1000research.18061.3 + + We present a tutorial that introduces high school students to the Gene Ontology classification system which is widely used in genomics and systems biology studies to characterize large sets of genes based on functional and structural information. This classification system is a valuable and standardized method used to identify genes that act in similar processes and pathways and also provides insight into the overall architecture and distribution of genes and gene families associated with a particular tissue or disease. By means of this tutorial, students learn how the classification system works through analyzing a gene set using DAVID the Database for Annotation, Visualization and Integrated Discovery that incorporates the Gene Ontology system into its suite of analysis tools. This method of profiling genes is used by our high school student interns to categorize gene expression data related to behavioral neuroscience. Students will get a feel for working with genes and gene sets, acquire vocabulary, obtain an understanding of how a database is structured and gain an awareness of the vast amount of information that is known about genes as well as the online analysis tools to manage this information that is nowadays available. Based on survey responses, students intellectually benefit from learning about the Gene Ontology System and using the DAVID tools, they are better prepared for future database use and they also find it enjoyable. + + + + Dedhia + Mehek + M + + BioScience Project, Wakefield, MA, 01880, USA. + + + + Kohetuk + Kenneth + K + https://orcid.org/0000-0003-1503-5735 + + Saint Dominic Savio Catholic High School, Austin, TX, 78717, USA. + + + + Crusio + Wim E + WE + https://orcid.org/0000-0001-6638-202X + + Institut de Neurosciences Cognitives et Intégratives d'Aquitaine (UMR 5287), Pessac, France. + + + University of Bordeaux (UMR 5287), Pessac, France. + + + + Delprato + Anna + A + https://orcid.org/0000-0002-6134-2560 + + BioScience Project, Wakefield, MA, 01880, USA. + + + + eng + + + figshare + + 10.6084/m9.figshare.8166611.v1 + + + + + Journal Article + + + 2019 + 03 + 01 + +
+ + England + F1000Res + 101594320 + 2046-1402 + + IM + + gene ontology + genomics + high school students + + No competing interests were disclosed. +
+ + + + 2019 + 05 + 22 + + + 2019 + 8 + 24 + 6 + 0 + + + 2019 + 8 + 24 + 6 + 0 + + + 2019 + 8 + 24 + 6 + 0 + + + epublish + + 31431825 + 10.12688/f1000research.18061.3 + PMC6619382.3 + + +
+ + + 31431825 + + 2019 + 08 + 23 + +
+ + 2046-1402 + + 8 + + 2019 + + + F1000Research + F1000Res + + Introducing high school students to the Gene Ontology classification system. + + 241 + + 10.12688/f1000research.18061.4 + + We present a tutorial that introduces high school students to the Gene Ontology classification system which is widely used in genomics and systems biology studies to characterize large sets of genes based on functional and structural information. This classification system is a valuable and standardized method used to identify genes that act in similar processes and pathways and also provides insight into the overall architecture and distribution of genes and gene families associated with a particular tissue or disease. By means of this tutorial, students learn how the classification system works through analyzing a gene set using DAVID the Database for Annotation, Visualization and Integrated Discovery that incorporates the Gene Ontology system into its suite of analysis tools. This method of analyzing genes is used by our high school student interns to categorize gene expression data related to behavioral neuroscience. Students will get a feel for working with genes and gene sets, acquire vocabulary, obtain an understanding of how a database is structured and gain an awareness of the vast amount of information that is known about genes as well as the online analysis tools to manage this information that is nowadays available. Based on survey responses, students intellectually benefit from learning about the Gene Ontology System and using the DAVID tools, they are better prepared for future database use and they also find it enjoyable. + + + + Dedhia + Mehek + M + + BioScience Project, Wakefield, MA, 01880, USA. + + + + Kohetuk + Kenneth + K + https://orcid.org/0000-0003-1503-5735 + + Saint Dominic Savio Catholic High School, Austin, TX, 78717, USA. + + + + Crusio + Wim E + WE + https://orcid.org/0000-0001-6638-202X + + Institut de Neurosciences Cognitives et Intégratives d'Aquitaine (UMR 5287), Pessac, France. + + + University of Bordeaux (UMR 5287), Pessac, France. + + + + Delprato + Anna + A + https://orcid.org/0000-0002-6134-2560 + + BioScience Project, Wakefield, MA, 01880, USA. + + + + eng + + + figshare + + 10.6084/m9.figshare.8166611.v1 + + + + + Journal Article + + + 2019 + 03 + 01 + +
+ + England + F1000Res + 101594320 + 2046-1402 + + IM + + gene ontology + genomics + high school students + + No competing interests were disclosed. +
+ + + + 2019 + 07 + 31 + + + 2019 + 8 + 24 + 6 + 0 + + + 2019 + 8 + 24 + 6 + 0 + + + 2019 + 8 + 24 + 6 + 0 + + + epublish + + 31431825 + 10.12688/f1000research.18061.4 + PMC6619382 + + +
+
diff --git a/src/test/scala/org/renci/chemotext/OmnicorpTests.scala b/src/test/scala/org/renci/chemotext/OmnicorpTests.scala index 738bf66..2c2d797 100644 --- a/src/test/scala/org/renci/chemotext/OmnicorpTests.scala +++ b/src/test/scala/org/renci/chemotext/OmnicorpTests.scala @@ -3,10 +3,13 @@ package org.renci.chemotext import java.io.File import java.nio.file.Files +import org.apache.jena.rdf.model.{ModelFactory, Resource} +import org.apache.jena.vocabulary.RDF import utest._ import sys.process._ import scala.util.matching.Regex +import collection.JavaConverters._ /** * Tests for the entire Omnicorp application. @@ -31,7 +34,7 @@ object OmnicorpTests extends TestSuite { val tests: Tests = Tests { test("Make sure we can run Omnicorp and see runtime information") { - val (status, stdout, stderr) = exec(Seq("sbt", "run")) + val (status, stdout, _) = exec(Seq("sbt", "run")) assert(status == 1) assert(stdout contains "Omnicorp requires four arguments:") assert(stdout contains "Nonzero exit code returned from runner: 2") @@ -42,7 +45,7 @@ object OmnicorpTests extends TestSuite { val tmpFolder = Files.createTempDirectory("omnicorp-testing").toFile test("Make sure we can execute Omnicorp on the example file") { - val (status, stdout, stderr) = + val (status, stdout, _) = exec(Seq("sbt", s"""run none "$examplesForTests" "$tmpFolder" 1""")) // Clean up temporary folder. @@ -65,7 +68,7 @@ object OmnicorpTests extends TestSuite { val tmpFolder = Files.createTempDirectory("omnicorp-testing").toFile test("Make sure we get a warning message on executing Omnicorp on this example file") { - val (status, stdout, stderr) = + val (status, stdout, _) = exec(Seq("sbt", s"""run none "$failedExamples1" "$tmpFolder" 1""")) // Clean up temporary folder. @@ -81,10 +84,53 @@ object OmnicorpTests extends TestSuite { assert(stdout contains "Begin processing") assert(stdout contains "WARN org.renci.chemotext.PubMedTripleGenerator") assert( - stdout contains "Unable to parse date http://purl.org/dc/terms/issued on https://www.ncbi.nlm.nih.gov/pubmed/10542500: Could not parse XML node as date: Dec-Jan" + stdout contains "Unable to parse date http://purl.org/dc/terms/issued on https://www.ncbi.nlm.nih.gov/pubmed/10542500.1: Could not parse XML node as date: Dec-Jan" ) assert(!finalResultRegex.findFirstIn(stdout).isEmpty) } } + + test("On input file alternateVersions.xml containing alternate versions") { + val failedExamples1 = getClass.getResource("/pubmedXML/alternateVersions.xml").getPath + val tmpFolder = Files.createTempDirectory("omnicorp-testing").toFile + + test("Make sure we have all four versions") { + val (status, stdout, _) = + exec(Seq("sbt", s"""run none "$failedExamples1" "$tmpFolder" 1""")) + + // Test output and errors. + assert(status == 0) + assert(stdout contains "Total time:") + assert(stdout contains "completed") + + assert(stdout contains "Begin processing") + assert(stdout contains "INFO org.renci.chemotext.Main$") + assert( + stdout matches """.*Took \d+ seconds \([\w\.]+\) to create approx \d+ triples from \d+ articles.*""" + ) + + // Load temporary file and make sure we have all four versions. + val outputFile = new File(tmpFolder, "alternateVersions.xml.ttl") + val model = ModelFactory.createDefaultModel() + model.read(outputFile.toURI.toString) + + val articleClass = model.createResource("http://purl.org/spar/fabio/Article"); + val articles: Seq[Resource] = + model.listResourcesWithProperty(RDF.`type`, articleClass).toList.asScala + assert(articles.size == 4) + assert( + articles.map(_.getURI).toSet == Set( + "https://www.ncbi.nlm.nih.gov/pubmed/31431825.1", + "https://www.ncbi.nlm.nih.gov/pubmed/31431825.2", + "https://www.ncbi.nlm.nih.gov/pubmed/31431825.3", + "https://www.ncbi.nlm.nih.gov/pubmed/31431825.4" + ) + ) + + // Clean up temporary folder. + outputFile.delete() + tmpFolder.delete() + } + } } } diff --git a/src/test/scala/org/renci/chemotext/PubMedArticleWrapperIntegrationTests.scala b/src/test/scala/org/renci/chemotext/PubMedArticleWrapperIntegrationTests.scala index 36a9b1f..c62e603 100644 --- a/src/test/scala/org/renci/chemotext/PubMedArticleWrapperIntegrationTests.scala +++ b/src/test/scala/org/renci/chemotext/PubMedArticleWrapperIntegrationTests.scala @@ -101,8 +101,9 @@ object PubMedArticleWrapperIntegrationTests extends TestSuite { assert( summarizedTriples == Map( - "https://www.ncbi.nlm.nih.gov/pubmed/11237011" -> Map( + "https://www.ncbi.nlm.nih.gov/pubmed/11237011.1" -> Map( "http://www.w3.org/1999/02/22-rdf-syntax-ns#type" -> Map("URI" -> 1), + "http://purl.org/dc/terms/isVersionOf" -> Map("URI" -> 1), "http://purl.org/spar/fabio/hasPublicationYear" -> Map( "http://www.w3.org/2001/XMLSchema#gYear" -> 1 ), @@ -192,8 +193,9 @@ object PubMedArticleWrapperIntegrationTests extends TestSuite { "http://www.w3.org/2001/XMLSchema#string" -> 1 ) ), - "https://www.ncbi.nlm.nih.gov/pubmed/17060194" -> Map( + "https://www.ncbi.nlm.nih.gov/pubmed/17060194.1" -> Map( "http://www.w3.org/1999/02/22-rdf-syntax-ns#type" -> Map("URI" -> 1), + "http://purl.org/dc/terms/isVersionOf" -> Map("URI" -> 1), "http://purl.org/spar/fabio/hasPublicationYear" -> Map( "http://www.w3.org/2001/XMLSchema#gYear" -> 1 ), @@ -262,7 +264,8 @@ object PubMedArticleWrapperIntegrationTests extends TestSuite { foaf:givenName "Gaurav" ; foaf:name "Gaurav Vaidya Jr" . - a fabio:Article ; + a fabio:Article ; + dct:isVersionOf ; prism:doi "10.1080/10635150600969864" ; prism:endingPage "28" ; prism:pageRange "715-28" ; @@ -335,8 +338,9 @@ object PubMedArticleWrapperIntegrationTests extends TestSuite { val summarizedTriples = summarizeTriples(triples) assert( summarizedTriples == Map( - "https://www.ncbi.nlm.nih.gov/pubmed/22859891" -> Map( + "https://www.ncbi.nlm.nih.gov/pubmed/22859891.1" -> Map( "http://www.w3.org/1999/02/22-rdf-syntax-ns#type" -> Map("URI" -> 1), + "http://purl.org/dc/terms/isVersionOf" -> Map("URI" -> 1), "http://purl.org/spar/fabio/hasPublicationYear" -> Map( "http://www.w3.org/2001/XMLSchema#gYear" -> 1 ), @@ -394,8 +398,9 @@ object PubMedArticleWrapperIntegrationTests extends TestSuite { summarizeTriples(PubMedTripleGenerator.generateTriples(wrappedArticle, None)) assert( summarizedTriples == Map( - "https://www.ncbi.nlm.nih.gov/pubmed/10542500" -> Map( + "https://www.ncbi.nlm.nih.gov/pubmed/10542500.1" -> Map( "http://www.w3.org/1999/02/22-rdf-syntax-ns#type" -> Map("URI" -> 1), + "http://purl.org/dc/terms/isVersionOf" -> Map("URI" -> 1), "http://purl.org/spar/fabio/hasPublicationYear" -> Map( "http://www.w3.org/2001/XMLSchema#gYear" -> 1 ), @@ -446,9 +451,10 @@ object PubMedArticleWrapperIntegrationTests extends TestSuite { assert( summarizedTriples == Map( - "https://www.ncbi.nlm.nih.gov/pubmed/15517475" -> Map( - "http://purl.org/dc/terms/issued" -> Map("http://www.w3.org/2001/XMLSchema#date" -> 1), - "http://purl.org/dc/terms/title" -> Map("http://www.w3.org/2001/XMLSchema#string" -> 1), + "https://www.ncbi.nlm.nih.gov/pubmed/15517475.1" -> Map( + "http://purl.org/dc/terms/isVersionOf" -> Map("URI" -> 1), + "http://purl.org/dc/terms/issued" -> Map("http://www.w3.org/2001/XMLSchema#date" -> 1), + "http://purl.org/dc/terms/title" -> Map("http://www.w3.org/2001/XMLSchema#string" -> 1), "http://purl.org/dc/terms/modified" -> Map( "http://www.w3.org/2001/XMLSchema#date" -> 1 ),