From c31514bf26ef0d007bea0555a64719cd4470b3c6 Mon Sep 17 00:00:00 2001
From: Demian Katz
Date: Fri, 10 Nov 2023 13:22:31 -0500
Subject: [PATCH] Add DOAJ XML support. (#149)

Thanks, @demiankatz!
---
 conf/datasources.ini.sample                   |   7 +
 src/RecordManager/Base/Record/Doaj.php        | 365 ++++++++++++++++++
 .../Base/config/module.config.php             |   2 +
 3 files changed, 374 insertions(+)
 create mode 100644 src/RecordManager/Base/Record/Doaj.php

diff --git a/conf/datasources.ini.sample b/conf/datasources.ini.sample
index 64e3fb69f..1e1284f9c 100644
--- a/conf/datasources.ini.sample
+++ b/conf/datasources.ini.sample
@@ -232,3 +232,10 @@
 ;institution = GeniePlusLib
 ;format = marc
 ;dedup = false
+
+; Sample configuration for DOAJ open access articles
+;[DOAJ]
+;url = https://doaj.org/oai.article
+;metadataPrefix = oai_doaj
+;format = doaj
+;institution = DOAJ
diff --git a/src/RecordManager/Base/Record/Doaj.php b/src/RecordManager/Base/Record/Doaj.php
new file mode 100644
index 000000000..0ca1864ba
--- /dev/null
+++ b/src/RecordManager/Base/Record/Doaj.php
@@ -0,0 +1,365 @@
+<?php
+
+/**
+ * DOAJ record class
+ *
+ * NOTE(review): the original file header (copyright and license notice) was
+ * lost when this patch was extracted; restore it from upstream before applying.
+ *
+ * @author    Demian Katz
+ * @license   http://opensource.org/licenses/gpl-2.0.php GNU General Public License
+ * @link      https://github.com/NatLibFi/RecordManager
+ */
+
+namespace RecordManager\Base\Record;
+
+use RecordManager\Base\Database\DatabaseInterface as Database;
+use RecordManager\Base\Http\ClientManager as HttpClientManager;
+use RecordManager\Base\Utils\Logger;
+use RecordManager\Base\Utils\MetadataUtils;
+
+/**
+ * DOAJ record class
+ *
+ * This is a class for processing DOAJ article records (oai_doaj XML).
+ *
+ * @category DataManagement
+ * @package  RecordManager
+ * @author   Ere Maijala
+ * @author   Demian Katz
+ * @license  http://opensource.org/licenses/gpl-2.0.php GNU General Public License
+ * @link     https://github.com/NatLibFi/RecordManager
+ */
+class Doaj extends AbstractRecord
+{
+    use XmlRecordTrait {
+        XmlRecordTrait::setData as XmlTraitSetData;
+    }
+    use FullTextTrait;
+
+    /**
+     * Document
+     *
+     * @var \SimpleXMLElement
+     */
+    protected $doc = null;
+
+    /**
+     * HTTP client manager for FullTextTrait
+     *
+     * @var HttpClientManager
+     */
+    protected $httpClientManager;
+
+    /**
+     * Database for FullTextTrait
+     *
+     * @var ?Database
+     */
+    protected $db;
+
+    /**
+     * Record namespace identifier
+     *
+     * @var string
+     */
+    protected $recordNs = 'http://doaj.org/features/oai_doaj/1.0/';
+
+    /**
+     * Constructor
+     *
+     * @param array             $config           Main configuration
+     * @param array             $dataSourceConfig Data source settings
+     * @param Logger            $logger           Logger
+     * @param MetadataUtils     $metadataUtils    Metadata utilities
+     * @param HttpClientManager $httpManager      HTTP client manager
+     * @param ?Database         $db               Database
+     */
+    public function __construct(
+        $config,
+        $dataSourceConfig,
+        Logger $logger,
+        MetadataUtils $metadataUtils,
+        HttpClientManager $httpManager,
+        ?Database $db = null
+    ) {
+        parent::__construct($config, $dataSourceConfig, $logger, $metadataUtils);
+        $this->httpClientManager = $httpManager;
+        $this->db = $db;
+    }
+
+    /**
+     * Set record data
+     *
+     * If the metadata has no recordID element, one is derived from the OAI-PMH
+     * identifier and added to the document.
+     *
+     * @param string $source Source ID
+     * @param string $oaiID  Record ID received from OAI-PMH (or empty string for
+     * file import)
+     * @param string $data   Metadata
+     *
+     * @return void
+     */
+    public function setData($source, $oaiID, $data)
+    {
+        $this->XmlTraitSetData($source, $oaiID, $data);
+
+        if (
+            empty($this->doc->recordID)
+            && empty($this->doc->children($this->recordNs)->recordID)
+        ) {
+            // Use the third colon-separated part of an "oai:..." identifier when
+            // present; otherwise fall back to the identifier as given.
+            $parts = explode(':', $oaiID);
+            $id = ('oai' === $parts[0] && !empty($parts[2]))
+                ? $parts[2] : $oaiID;
+            $this->doc->addChild('recordID', $id);
+        }
+    }
+
+    /**
+     * Return record ID (local)
+     *
+     * @return string
+     */
+    public function getID()
+    {
+        $id = (string)$this->doc->recordID[0];
+        if ('' === $id) {
+            $id = (string)$this->doc->children($this->recordNs)->recordID[0];
+        }
+        return $id;
+    }
+
+    /**
+     * Return fields to be indexed in Solr
+     *
+     * @param ?Database $db Database connection. Omit to avoid database lookups for
+     * related records.
+     *
+     * @return array
+     */
+    public function toSolrArray(?Database $db = null)
+    {
+        $data = $this->getFullTextFields($this->doc);
+
+        $doc = $this->doc->children($this->recordNs);
+        $data['record_format'] = 'doaj';
+        $data['ctrlnum'] = $this->getID();
+        // Serialize from the root element: asXML() on a list of child elements
+        // only outputs the first element of the list.
+        $data['fullrecord'] = $this->doc->asXML();
+
+        // allfields
+        $allFields = [];
+        foreach ($doc as $field) {
+            $value = $this->metadataUtils->stripTrailingPunctuation(
+                trim((string)$field)
+            );
+            // Skip elements without text of their own
+            if ('' !== $value) {
+                $allFields[] = $value;
+            }
+        }
+        $data['allfields'] = $allFields;
+
+        // language: split each space-separated token into 3-character codes
+        $languages = [];
+        foreach ($doc->language as $languageField) {
+            foreach (explode(' ', trim((string)$languageField)) as $language) {
+                if ('' === $language) {
+                    continue;
+                }
+                foreach (str_split($language, 3) as $code) {
+                    $languages[] = $code;
+                }
+            }
+        }
+        $data['language'] = $this->metadataUtils
+            ->normalizeLanguageStrings($languages);
+
+        $data['format'] = $this->getFormat();
+
+        // All author names, not just the first one
+        $data['author'] = $this->getNestedValues($doc->authors, 'author', 'name');
+
+        $data['title'] = $data['title_full'] = $this->getTitle();
+        $titleParts = explode(' : ', $data['title'], 2);
+        $data['title_short'] = $titleParts[0];
+        if (isset($titleParts[1])) {
+            $data['title_sub'] = $titleParts[1];
+        }
+        $data['title_sort'] = $this->getTitle(true);
+
+        $data['publisher'] = [
+            $this->metadataUtils->stripTrailingPunctuation(
+                trim((string)$doc->publisher)
+            ),
+        ];
+        $data['publishDate'] = $this->getPublicationYear();
+
+        // All keywords, not just the first one
+        $data['topic'] = $data['topic_facet']
+            = $this->getNestedValues($doc->keywords, 'keyword');
+
+        // Index URLs as plain strings rather than as an element object
+        $urls = [];
+        foreach ($doc->fullTextUrl as $url) {
+            $url = trim((string)$url);
+            if ('' !== $url) {
+                $urls[] = $url;
+            }
+        }
+        $data['url'] = $urls;
+
+        return $data;
+    }
+
+    /**
+     * Dedup: Return full title (for debugging purposes only)
+     *
+     * @return string
+     */
+    public function getFullTitleForDebugging()
+    {
+        return trim((string)$this->doc->children($this->recordNs)->title);
+    }
+
+    /**
+     * Dedup: Return record title
+     *
+     * @param bool $forFiling Whether the title is to be used in filing
+     * (e.g. sorting, non-filing characters should be removed)
+     *
+     * @return string
+     */
+    public function getTitle($forFiling = false)
+    {
+        $title = trim((string)$this->doc->children($this->recordNs)->title);
+        if ($forFiling) {
+            $title = $this->metadataUtils->createSortTitle($title);
+        } else {
+            $title
+                = $this->metadataUtils->stripTrailingPunctuation($title, '', true);
+        }
+        return $title;
+    }
+
+    /**
+     * Return main author
+     *
+     * NOTE(review): the first author name is returned as found in the record; it
+     * is not converted to "Last, First" format.
+     *
+     * @return string
+     */
+    public function getMainAuthor()
+    {
+        $doc = $this->doc->children($this->recordNs);
+        return trim((string)($doc?->authors?->author?->name ?? ''));
+    }
+
+    /**
+     * Dedup: Return ISBNs in ISBN-13 format without dashes
+     *
+     * @return array
+     */
+    public function getISBNs()
+    {
+        return [];
+    }
+
+    /**
+     * Dedup: Return series ISSN
+     *
+     * @return string
+     */
+    public function getSeriesISSN()
+    {
+        return '';
+    }
+
+    /**
+     * Dedup: Return series numbering
+     *
+     * @return string
+     */
+    public function getSeriesNumbering()
+    {
+        return '';
+    }
+
+    /**
+     * Dedup: Return format from predefined values
+     *
+     * @return string|array
+     */
+    public function getFormat()
+    {
+        return 'Article';
+    }
+
+    /**
+     * Dedup: Return publication year (four digits only)
+     *
+     * @return string
+     */
+    public function getPublicationYear()
+    {
+        $date = trim((string)$this->doc->children($this->recordNs)->publicationDate);
+        $date = substr($date, 0, 4);
+        if (preg_match('{^(\d{4})$}', $date)) {
+            return $date;
+        }
+        return '';
+    }
+
+    /**
+     * Dedup: Return page count (number only)
+     *
+     * @return string
+     */
+    public function getPageCount()
+    {
+        return '';
+    }
+
+    /**
+     * Get DOIs
+     *
+     * @return array
+     */
+    protected function getDOIs(): array
+    {
+        return [];
+    }
+
+    /**
+     * Collect trimmed, non-empty text values from repeated child elements
+     *
+     * @param \SimpleXMLElement $containers Container element(s), e.g. authors
+     * @param string            $child      Name of the repeated child element
+     * @param string            $leaf       Sub-element of each child to read, or
+     * empty string to read the child's own text
+     *
+     * @return array
+     */
+    protected function getNestedValues(
+        \SimpleXMLElement $containers,
+        string $child,
+        string $leaf = ''
+    ): array {
+        $values = [];
+        foreach ($containers as $container) {
+            foreach ($container->{$child} as $element) {
+                $node = '' === $leaf ? $element : $element->{$leaf};
+                $value = trim((string)$node);
+                if ('' !== $value) {
+                    $values[] = $value;
+                }
+            }
+        }
+        return $values;
+    }
+}
diff --git a/src/RecordManager/Base/config/module.config.php b/src/RecordManager/Base/config/module.config.php
index 0742a0d2e..86713faba 100644
--- a/src/RecordManager/Base/config/module.config.php
+++ b/src/RecordManager/Base/config/module.config.php
@@ -124,6 +124,7 @@
             'record' => [
                 'factories' => [
                     \RecordManager\Base\Record\Dc::class => \RecordManager\Base\Record\AbstractRecordWithHttpAndDbFactory::class,
+                    \RecordManager\Base\Record\Doaj::class => \RecordManager\Base\Record\AbstractRecordWithHttpAndDbFactory::class,
                     \RecordManager\Base\Record\Eaccpf::class => \RecordManager\Base\Record\AbstractRecordFactory::class,
                     \RecordManager\Base\Record\Ead::class => \RecordManager\Base\Record\AbstractRecordFactory::class,
                     \RecordManager\Base\Record\Ead3::class => \RecordManager\Base\Record\AbstractRecordFactory::class,
@@ -138,6 +139,7 @@
                 ],
                 'aliases' => [
                     'dc' => \RecordManager\Base\Record\Dc::class,
+                    'doaj' => \RecordManager\Base\Record\Doaj::class,
                     'eaccpf' => \RecordManager\Base\Record\Eaccpf::class,
                     'ead' => \RecordManager\Base\Record\Ead::class,
                     'ead3' => \RecordManager\Base\Record\Ead3::class,