From ef76200afac29b34962d125835bf7b99c60dd22b Mon Sep 17 00:00:00 2001 From: Daniel Shapiro Date: Tue, 23 Apr 2019 21:32:06 -0700 Subject: [PATCH] 1.0.3 improvements to display of output --- build.sbt | 2 +- src/main/scala/cladograms/Cladogram.scala | 2 +- .../scala/cladograms/EnWikipediaClade.scala | 21 +++++++++++-------- 3 files changed, 14 insertions(+), 11 deletions(-) diff --git a/build.sbt b/build.sbt index 29bd8da..b4ae614 100644 --- a/build.sbt +++ b/build.sbt @@ -1,6 +1,6 @@ name := "wikiClade" -version := "1.0.2" +version := "1.0.3" scalaVersion := "2.12.8" diff --git a/src/main/scala/cladograms/Cladogram.scala b/src/main/scala/cladograms/Cladogram.scala index 14add62..60978eb 100644 --- a/src/main/scala/cladograms/Cladogram.scala +++ b/src/main/scala/cladograms/Cladogram.scala @@ -34,7 +34,7 @@ class Cladogram (val clade: Clade, var children: Set[Cladogram]) { def prominentDescendants(verbosity: Int): Set[Cladogram] = for { child <- children descendant = - if (child.children.size == 1 && !child.children.head.clade.shouldDisplay(verbosity)) + if (child.children.size == 1 && !child.clade.shouldDisplay(verbosity)) child.prominentDescendants(verbosity).head //guaranteed to have exactly one element else child } yield descendant diff --git a/src/main/scala/cladograms/EnWikipediaClade.scala b/src/main/scala/cladograms/EnWikipediaClade.scala index 4df3f08..7c13361 100644 --- a/src/main/scala/cladograms/EnWikipediaClade.scala +++ b/src/main/scala/cladograms/EnWikipediaClade.scala @@ -5,22 +5,25 @@ import org.jsoup.nodes.{Document, Element} import org.jsoup.select.Elements import scala.util.{Failure, Success, Try} - /** * Created by Daniel on 4/8/2019. */ case class EnWikipediaClade(val name: String, val path: Option[String], val priorityOverride: Double = 100) extends Clade { val baseUrl = "https://en.wikipedia.org" + val ignorableCladeTypes = Set("Clade", "(unranked)") + val importantCladeTypes = Set("Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species") lazy val meta: WikiCladeMetadata = getMeta def ancestors: List[Clade] = meta.ancestors - def priority: Double = Math.min(priorityOverride, meta.docPriority) + def priority: Double = Math.min( + Math.min(priorityOverride, meta.docPriority), + if (importantCladeTypes contains meta.cladeType) 20 else 100) override def shouldDisplay(verbosity: Int): Boolean = priority <= verbosity override def DOTDefinition: Option[String] = { - val cladeTypeStr = if (meta.cladeType.isEmpty) "" else meta.cladeType + "
" + val cladeTypeStr = if (meta.cladeType.isEmpty) "" else s"""${meta.cladeType}
""" Some(s""""$name" [label=<$cladeTypeStr$name>]""") } @@ -37,7 +40,7 @@ case class EnWikipediaClade(val name: String, val path: Option[String], val prio else new EnWikipediaClade(details.name, Some(details.path)) } val docPriority = priorityBasedOnDoc(docOpt) - WikiCladeMetadata(ancestors, cladeType, docPriority) + WikiCladeMetadata(ancestors, sanitizeCladeType(cladeType), docPriority) } private def getDoc: Option[Document] = path match { @@ -92,15 +95,10 @@ case class EnWikipediaClade(val name: String, val path: Option[String], val prio val details = parseRow(row) if (details.path.nonEmpty) { val pagetry = Try(Jsoup.connect(baseUrl + details.path).get().select("title").text()) -// val pagetry = doctry match { -// case Success(doc) => Try(doc.select("title").text()) -// case Failure(e) => Failure(e) -// } pagetry match { case Success(page) => if (knownPages contains page) { iter(i + 1, started, knownPages, TaxonDetails(details.name, details.cladeType, "") :: taxList) } else { - //val pri = priorityBasedOnDoc(doctry.get) iter(i + 1, started, knownPages + page, details :: taxList) } case Failure(_) => @@ -116,6 +114,11 @@ case class EnWikipediaClade(val name: String, val path: Option[String], val prio iter(0, false, Set(), List()) } + private def sanitizeCladeType(cladeType: String): String = { + val cleaned = cladeType.replaceAll(":", "").trim + if (ignorableCladeTypes contains cleaned) "" else cleaned + } + def priorityBasedOnDoc(docOpt: Option[Document]): Double = docOpt match { case Some(doc) => Math.min (99, Math.max (1, 100 - (15 * (Math.log (doc.text ().length) - 7) ) ) ) case None => 99