From 7558bc3032a09964d764babbcc77380632e11d36 Mon Sep 17 00:00:00 2001 From: "Brian \"Moses\" Hall" Date: Tue, 29 Oct 2024 13:14:53 -0400 Subject: [PATCH] DEV-1373 make catalog indexing date independent (#52) - Add `CICTL::Journal` class for writing dated files to a predefined location to record Zephir files indexed. - Add `journal_directory` to `Services` with `ENV`-overridable default location for journal files. - Add `cictl continue` command that calls `cictl all` or `cictl since` depending on presence or absence of relevant journals. - TIDY: remove deprecated docker-compose.yml version. - Address a number of nokogiri/rexml vulnerabilities identified by Dependabot. - Address #50 availability maps should account for icus. - Remove `standardrb` exception for `lib/translation_maps` (mostly) by changing single to double quotes. - Remove `ht_namespace_map` and unused reference to it. - Remove unused umich translation maps. - Run many cictl tests in temp directory with `around` block. - Add `CICTL::Examples` helpers to tidy up test setup. 
--- .standard.yml | 1 - Gemfile | 1 + Gemfile.lock | 10 +- README.md | 14 +- docker-compose.yml | 1 - lib/cictl/index_command.rb | 37 +- lib/cictl/journal.rb | 66 ++ lib/ht_traject/ht_item.rb | 5 - lib/services.rb | 8 + .../ht/availability_map_ht.rb | 33 +- .../ht/availability_map_ht_intl.rb | 31 +- lib/translation_maps/ht/country_map.rb | 634 +++++++++--------- lib/translation_maps/ht/ht_namespace_map.rb | 34 - .../umich/availability_map_umich.rb | 10 - lib/translation_maps/umich/institution_map.rb | 8 - spec/cictl/delete_command_spec.rb | 25 +- spec/cictl/index_command_spec.rb | 54 +- spec/cictl/journal_spec.rb | 46 ++ spec/cictl/logger_factory_spec.rb | 26 +- spec/examples.rb | 16 +- spec/spec_helper.rb | 84 ++- .../availability_map_ht_intl_spec.rb | 54 ++ .../availability_map_ht_spec.rb | 54 ++ 23 files changed, 783 insertions(+), 469 deletions(-) create mode 100644 lib/cictl/journal.rb delete mode 100644 lib/translation_maps/ht/ht_namespace_map.rb delete mode 100644 lib/translation_maps/umich/availability_map_umich.rb delete mode 100644 lib/translation_maps/umich/institution_map.rb create mode 100644 spec/cictl/journal_spec.rb create mode 100644 spec/translation_maps/availability_map_ht_intl_spec.rb create mode 100644 spec/translation_maps/availability_map_ht_spec.rb diff --git a/.standard.yml b/.standard.yml index 9348dda..822791e 100644 --- a/.standard.yml +++ b/.standard.yml @@ -5,7 +5,6 @@ ignore: - 'lib/ht_traject/**/*' - 'lib/ht_traject.rb' - 'lib/traject/**/*' - - 'lib/translation_maps/**/*' - 'lib/umich_traject/**/*' - 'lib/umich_traject.rb' - 'readers/**/*' diff --git a/Gemfile b/Gemfile index c439058..ceaab72 100644 --- a/Gemfile +++ b/Gemfile @@ -2,6 +2,7 @@ source "https://rubygems.org" group :development, :test do gem "bundler", "~>2.0" + gem "climate_control" gem "rake", "~> 13.0" gem "standard" gem "rspec" diff --git a/Gemfile.lock b/Gemfile.lock index 4c027d0..7873654 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -6,6 +6,7 @@ GEM ast (2.4.2) 
builder (3.2.4) canister (0.9.2) + climate_control (1.2.0) coderay (1.1.3) concurrent-ruby (1.2.2) date_named_file (0.1.1) @@ -61,11 +62,11 @@ GEM match_map (3.0.0) method_source (1.0.0) naconormalizer (1.0.1-java) - nokogiri (1.16.2-arm64-darwin) + nokogiri (1.16.7-arm64-darwin) racc (~> 1.4) - nokogiri (1.16.2-java) + nokogiri (1.16.7-java) racc (~> 1.4) - nokogiri (1.16.2-x86_64-linux) + nokogiri (1.16.7-x86_64-linux) racc (~> 1.4) parallel (1.23.0) parser (3.2.2.1) @@ -83,7 +84,7 @@ GEM rainbow (3.1.1) rake (13.0.6) regexp_parser (2.8.0) - rexml (3.2.5) + rexml (3.3.8) rsolr (2.5.0) builder (>= 2.1.2) faraday (>= 0.9, < 3, != 2.0.0) @@ -178,6 +179,7 @@ PLATFORMS DEPENDENCIES bundler (~> 2.0) canister (~> 0.9.2) + climate_control date_named_file dotenv http (~> 5.0) diff --git a/README.md b/README.md index 3c7f5dc..0a3cd22 100644 --- a/README.md +++ b/README.md @@ -117,7 +117,16 @@ network (i.e. the one started with `docker-compose up` from this repository). Solr should be reachable via the `solr-sdr-catalog` hostname. ## How to do the basics +### Date-Independent Indexing +For use in production environments where daily and monthly indexing are ongoing activities, +we enable the indexer to maintain state by writing "journal" files: empty datestamped +files in a known location (`JOURNAL_DIRECTORY`). The command `cictl index continue` does whatever +full or daily indexing is appropriate given the state of the journals. + +Note that all of the `cictl index *` commands write journal files, with the exception of +`cictl index file` which takes only an `upd` MARC file rather than a MARC-deletes pair, and is not +expected to be used in an environment where date independence is in force. ### Putting a new solr configuration into place @@ -133,7 +142,7 @@ Solr should be reachable via the `solr-sdr-catalog` hostname. 
* (Optional) If your new solr config requires a full reindex, go ahead and get rid of the data with `rm -rf data` * Fire solr back up: `systemctl start solr-current-catalog` -* Give it a minute and then go to http://beeftea-2.umdl.umich.edu:9033/solr` to make sure the core came back up. +* Give it a minute and then go to `http://beeftea-2.umdl.umich.edu:9033/solr` to make sure the core came back up. * Do whatever indexing needs doing. ### Indexing @@ -193,6 +202,7 @@ The `index` command has a number of possibilities: > bundle exec bin/cictl help index Commands: cictl index all # Empty the catalog and index the most recent m... + cictl index continue # Index all files not represented in the indexe... cictl index date YYYYMMDD # Run the catchup (delete and index) for a part... cictl index file FILE # Index a single file cictl index help [COMMAND] # Describe subcommands or one specific subcommand @@ -283,6 +293,8 @@ and `config/env`. The defaults in the repository suffice for testing under Docke ## Environment variables * `DDIR` data directory, defaults to `/htsolr/catalog/prep` + * `JOURNAL_DIRECTORY` location of journal files (see Date-Independent Indexing above) defaulting + to `journal/` inside the repo directory. * `LOG_DIR` where to store logs, defaults to `/htsolr/catalog/prep`. * `MYSQL_HOST`, `MYSQL_DATABASE`, `MYSQL_USER`, `MYSQL_PASSWORD` *required* unless run with `NO_DB`. * `NO_DB` if you want to skip all the database stuff. Useful for testing. Implied by `NO_EXTERNAL_DATA`. 
diff --git a/docker-compose.yml b/docker-compose.yml index 1d36ef4..9a0c42a 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,5 +1,4 @@ --- -version: '3' services: traject: diff --git a/lib/cictl/index_command.rb b/lib/cictl/index_command.rb index d70d674..749c7ad 100644 --- a/lib/cictl/index_command.rb +++ b/lib/cictl/index_command.rb @@ -3,12 +3,37 @@ require_relative "base_command" require_relative "zephir_file" require_relative "deleted_records" +require_relative "journal" module CICTL class IndexCommand < BaseCommand class_option :reader, type: :string, desc: "Reader name/path" class_option :writer, type: :string, desc: "Writer name/path" + desc "continue", "Index all files not represented in the indexer journals" + def continue + last_full = ZephirFile.full_files.last + fatal "unable to find full Zephir file" unless last_full + # Index the most recent full file and subsequent ones if the + # full file journal is missing. + full_journal = Journal.new(date: last_full.to_datetime.to_date, full: true) + if full_journal.missing? + logger.info "missing full journal #{full_journal}, calling `cictl all`" + call_all_command + # Otherwise, check each date from the last full file date to yesterday. + # At the first date whose update journal is missing, index from that date onward. + else + (last_full.to_datetime.to_date..(Date.today - 1)).each do |date| + journal = Journal.new(date: date, full: false) + if journal.missing? + logger.info "missing update journal #{journal}, calling `cictl since #{journal.date}`" + call_since_command(journal.date) + break + end + end + end + end + desc "all", "Empty the catalog and index the most recent monthly followed by subsequent daily updates" option :wait, type: :boolean, desc: "Wait 5 seconds for Control-C", default: true def all @@ -39,6 +64,8 @@ def all solr_client.commit! end + # Note: this command does not write a journal since it only processes the MARC file + # but not the deletes. 
option :commit, type: :boolean, desc: "Commit changes to Solr", default: true desc "file FILE", "Index a single MARC file" def file(marcfile) @@ -52,7 +79,11 @@ def date(date) preflight with_date(date) do |date| index_deletes_for_date date - index_records_for_date date + if index_records_for_date date + journal = Journal.new(date: date) + logger.info("write journal file #{journal.path}") + journal.write! + end end end @@ -84,6 +115,7 @@ def today end no_commands do + alias_method :call_all_command, :all alias_method :call_date_command, :date alias_method :call_file_command, :file alias_method :call_since_command, :since @@ -118,14 +150,17 @@ def marc_file_for_date(date) ZephirFile.update_files.at(date) end + # @return [Boolean] true if the marcfile for the given date exists and was indexed, false if it is missing def index_records_for_date(date) marcfile = marc_file_for_date date if File.exist? marcfile Indexer.new(reader: options[:reader], writer: options[:writer]).run marcfile solr_client.commit! logger.debug "index date(#{date}): Solr count now #{solr_client.count}" + true else logger.warn "could not find marcfile '#{marcfile}'" + false end end diff --git a/lib/cictl/journal.rb b/lib/cictl/journal.rb new file mode 100644 index 0000000..4dd1952 --- /dev/null +++ b/lib/cictl/journal.rb @@ -0,0 +1,66 @@ +# frozen_string_literal: true + +require_relative "../services" + +module CICTL + # A class that enables date-independent catalog indexing using the filesystem. + # + # Each time a full or update file is indexed, writes an (empty) file of the form + # hathitrust_catalog_indexer_journal_upd_YYYYMMDD.txt or + # hathitrust_catalog_indexer_journal_full_YYYYMMDD.txt in the journal directory. + # + # When we use the index command `cictl index continue` + # we calculate the earliest zephir file not yet indexed and proceed in order from + # that point. + # + # Nomenclature note: "journal" is the closest semantic match to "log" I could find. + # This is a log, of sorts, but the term was already taken. 
+ class Journal + attr_reader :date + + FILENAME_PATTERN = /hathitrust_catalog_indexer_journal_(full|upd)_(\d{8})\.txt/ + + def self.filename_for(date:, full:) + yyyymmdd = date.strftime "%Y%m%d" + type = full ? "full" : "upd" + "hathitrust_catalog_indexer_journal_#{type}_#{yyyymmdd}.txt" + end + + def initialize(date: Date.today - 1, full: false) + @date = date + @full = full + end + + # Use the built-in but append the date and full/upd because that's what we care about. + def to_s + super.tap do |s| + s.gsub!(/>$/, " [#{date} #{full? ? "full" : "upd"}]>") + end + end + + def full? + @full + end + + # Of the form `hathitrust_catalog_indexer_journal_(full|upd)_YYYYMMDD.txt` + def file + self.class.filename_for(date: date, full: full?) + end + + def path + File.join(HathiTrust::Services[:journal_directory], file) + end + + def exist? + File.exist? path + end + + def missing? + !exist? + end + + def write! + FileUtils.touch path + end + end +end diff --git a/lib/ht_traject/ht_item.rb b/lib/ht_traject/ht_item.rb index 6cc4cec..f15a257 100644 --- a/lib/ht_traject/ht_item.rb +++ b/lib/ht_traject/ht_item.rb @@ -21,7 +21,6 @@ class << self attr_accessor :ht_ns, :ht_avail_us, :ht_avail_intl end - self.ht_ns = ::Traject::TranslationMap.new('ht/ht_namespace_map') self.ht_avail_us = ::Traject::TranslationMap.new('ht/availability_map_ht') self.ht_avail_intl = ::Traject::TranslationMap.new('ht/availability_map_ht_intl') @@ -256,10 +255,6 @@ def enum_pubdate=(e) end end - def source - ItemSet.ht_ns[namespace] - end - def us_availability ItemSet.ht_avail_us[rights].first end diff --git a/lib/services.rb b/lib/services.rb index 2fc62de..dfcd505 100644 --- a/lib/services.rb +++ b/lib/services.rb @@ -63,6 +63,14 @@ def env_local_file ENV["LOG_DIR"] || default end + Services.register(:journal_directory) do + (ENV["JOURNAL_DIRECTORY"] || File.join(HOME, "journal")).tap do |dir| + if !File.exist?(dir) + FileUtils.mkdir_p dir + end + end + end + Services.register(:redirect_file) do # 
Start migrating from redirect_file to REDIRECT_FILE on principle of least surprise ENV["redirect_file"] || ENV["REDIRECT_FILE"] || Redirects.default_redirects_file diff --git a/lib/translation_maps/ht/availability_map_ht.rb b/lib/translation_maps/ht/availability_map_ht.rb index 1cc5ca9..4a3752d 100644 --- a/lib/translation_maps/ht/availability_map_ht.rb +++ b/lib/translation_maps/ht/availability_map_ht.rb @@ -1,18 +1,27 @@ -require 'ht_traject/ht_constants' -require 'match_map' +require "ht_traject/ht_constants" +require "match_map" mm = MatchMap.new -mm[/^umall$/] = HathiTrust::Constants::FT -mm[/world$/] = HathiTrust::Constants::FT # matches world, ic-world, und-world -mm[/^cc.*/] = HathiTrust::Constants::FT -mm[/^pd(?:us)?$/] = HathiTrust::Constants::FT # pd or pdus +# Note: orph, orphcand, and umall are unattested in rights_current as of Oct 2024 -mm[/^ic$/] = HathiTrust::Constants::SO -mm[/^orph$/] = HathiTrust::Constants::SO -mm[/^nobody$/] = HathiTrust::Constants::SO -mm[/^und$/] = HathiTrust::Constants::SO -mm[/^pd-p/] = HathiTrust::Constants::SO # pd-pvt or pd-private -mm[/^opb?$/] = HathiTrust::Constants::SO +# Full Text +mm["pd"] = HathiTrust::Constants::FT # [1] +mm["ic-world"] = HathiTrust::Constants::FT # [7] +mm["pdus"] = HathiTrust::Constants::FT # [9] +mm[/^cc-/] = HathiTrust::Constants::FT # [10-15, 17, 20-25] +mm["und-world"] = HathiTrust::Constants::FT # [18] + +# Search Only +mm["ic"] = HathiTrust::Constants::SO # [2] +mm["op"] = HathiTrust::Constants::SO # [3] +mm["orph"] = HathiTrust::Constants::SO # [4] +mm["und"] = HathiTrust::Constants::SO # [5] +mm["umall"] = HathiTrust::Constants::SO # [6] +mm["nobody"] = HathiTrust::Constants::SO # [8] +mm["orphcand"] = HathiTrust::Constants::SO # [16] +mm["icus"] = HathiTrust::Constants::SO # [19] +mm["pd-pvt"] = HathiTrust::Constants::SO # [26] +mm["supp"] = HathiTrust::Constants::SO # [27] mm diff --git a/lib/translation_maps/ht/availability_map_ht_intl.rb 
b/lib/translation_maps/ht/availability_map_ht_intl.rb index f445b9e..cc38eaf 100644 --- a/lib/translation_maps/ht/availability_map_ht_intl.rb +++ b/lib/translation_maps/ht/availability_map_ht_intl.rb @@ -1,17 +1,26 @@ -require 'ht_traject/ht_constants' +require "ht_traject/ht_constants" mm = MatchMap.new -mm['umall'] = HathiTrust::Constants::FT -mm['world'] = HathiTrust::Constants::FT # matches world, ic-world, und-world -mm[/^cc.*/] = HathiTrust::Constants::FT -mm['pd'] = HathiTrust::Constants::FT +# Note: orph, orphcand, and umall are unattested in rights_current as of Oct 2024 -mm['pdus'] = HathiTrust::Constants::SO -mm['ic'] = HathiTrust::Constants::SO -mm[/^opb?$/] = HathiTrust::Constants::SO -mm['orph'] = HathiTrust::Constants::SO -mm['nobody'] = HathiTrust::Constants::SO -mm['und'] = HathiTrust::Constants::SO +# Full Text +mm["pd"] = HathiTrust::Constants::FT # [1] +mm["ic-world"] = HathiTrust::Constants::FT # [7] +mm[/^cc-/] = HathiTrust::Constants::FT # [10-15, 17, 20-25] +mm["und-world"] = HathiTrust::Constants::FT # [18] +mm["icus"] = HathiTrust::Constants::FT # [19] + +# Search Only +mm["ic"] = HathiTrust::Constants::SO # [2] +mm["op"] = HathiTrust::Constants::SO # [3] +mm["orph"] = HathiTrust::Constants::SO # [4] +mm["und"] = HathiTrust::Constants::SO # [5] +mm["umall"] = HathiTrust::Constants::SO # [6] +mm["nobody"] = HathiTrust::Constants::SO # [8] +mm["pdus"] = HathiTrust::Constants::SO # [9] +mm["orphcand"] = HathiTrust::Constants::SO # [16] +mm["pd-pvt"] = HathiTrust::Constants::SO # [26] +mm["supp"] = HathiTrust::Constants::SO # [27] mm diff --git a/lib/translation_maps/ht/country_map.rb b/lib/translation_maps/ht/country_map.rb index aeac89a..53cd8ef 100644 --- a/lib/translation_maps/ht/country_map.rb +++ b/lib/translation_maps/ht/country_map.rb @@ -1,319 +1,319 @@ { - 'xr' => 'Czech Republic', - 'xp' => 'Spratly Island', - 'xo' => 'Slovakia', - 'xn' => 'Macedonia', - 'xm' => 'Saint Vincent and the Grenadines', - 'xl' => 'Saint Pierre and 
Miquelon', - 'xk' => 'Saint Lucia', - 'xj' => 'Saint Helena', - 'xi' => 'Saint Kitts-Nevis-Anguilla', - 'xh' => 'Niue', - 'xf' => 'Midway Islands', - 'xe' => 'Marshall Islands', - 'xd' => 'Saint Kitts-Nevis', - 'xc' => 'Maldives', - 'xb' => 'Cocos (Keeling) Islands', - 'uik' => 'United Kingdom Misc. Islands', - 'xa' => 'Christmas Island (Indian Ocean)', - 'ajr' => 'Azerbaijan S.S.R.', - 'xoa' => 'Northern Territory', - 'ws' => 'Samoa', - 'wk' => 'Wake Island', - 'wj' => 'West Bank of the Jordan River', - 'wf' => 'Wallis and Futuna', - 'wb' => 'West Berlin', - 'air' => 'Armenian S.S.R.', - 'xna' => 'New South Wales', - 'vs' => 'Vietnam, South', - 'stk' => 'Scotland', - 'vp' => 'Various places', - 'vn' => 'Vietnam, North', - 'vm' => 'Vietnam', - 'vi' => 'Virgin Islands of the United States', - 've' => 'Venezuela', - 'vc' => 'Vatican City', - 'vb' => 'British Virgin Islands', - 'uz' => 'Uzbekistan', - 'uy' => 'Uruguay', - 'uv' => 'Burkina Faso', - 'us' => 'United States', - 'ur' => 'Soviet Union', - 'up' => 'United States Misc. Pacific Islands', - 'tma' => 'Tasmania', - 'un' => 'Ukraine', - 'uk' => 'United Kingdom', - 'ui' => 'United Kingdom Misc. Islands', - 'ug' => 'Uganda', - 'uc' => 'United States Misc. 
Caribbean Islands', - 'ua' => 'Egypt', - 'gsr' => 'Georgian S.S.R.', - 'tz' => 'Tanzania', - 'tv' => 'Tuvalu', - 'tu' => 'Turkey', - 'tt' => 'Trust Territory of the Pacific Islands', - 'ts' => 'United Arab Emirates', - 'tr' => 'Trinidad and Tobago', - 'to' => 'Tonga', - 'tl' => 'Tokelau', - 'tk' => 'Turkmenistan', - 'ti' => 'Tunisia', - 'th' => 'Thailand', - 'tg' => 'Togo', - 'tc' => 'Turks and Caicos Islands', - 'ta' => 'Tajikistan', - 'tkr' => 'Turkmen S.S.R.', - 'sz' => 'Switzerland', - 'sy' => 'Syria', - 'sx' => 'Namibia', - 'sw' => 'Sweden', - 'sv' => 'Swan Islands', - 'su' => 'Saudi Arabia', - 'ss' => 'Western Sahara', - 'sr' => 'Surinam', - 'sq' => 'Swaziland', - 'sp' => 'Spain', - 'so' => 'Somalia', - 'sm' => 'San Marino', - 'qea' => 'Queensland', - 'sl' => 'Sierra Leone', - 'sk' => 'Sikkim', - 'sj' => 'Sudan', - 'si' => 'Singapore', - 'sh' => 'Spanish North Africa', - 'sg' => 'Senegal', - 'sf' => 'Sao Tome and Principe', - 'se' => 'Seychelles', - 'sb' => 'Svalbard', - 'sa' => 'South Africa', - 'ry' => 'Ryukyu Islands, Southern', - 'rw' => 'Rwanda', - 'ru' => 'Russia (Federation)', - 'rm' => 'Romania', - 'bwr' => 'Byelorussian S.S.R.', - 'rh' => 'Zimbabwe', - 're' => "R\303\203\302\251union", - 'rb' => 'Serbia', - 'rur' => 'Russian S.F.S.R.', - 'lir' => 'Lithuania', - 'qa' => 'Qatar', - 'py' => 'Paraguay', - 'pw' => 'Palau', - 'pt' => 'Portuguese Timor', - 'pr' => 'Puerto Rico', - 'pp' => 'Papua New Guinea', - 'uzr' => 'Uzbek S.S.R.', - 'po' => 'Portugal', - 'pn' => 'Panama', - 'pl' => 'Poland', - 'pk' => 'Pakistan', - 'ph' => 'Philippines', - 'aca' => 'Australian Capital Territory', - 'pg' => 'Guinea-Bissau', - 'pf' => 'Paracel Islands', - 'pe' => 'Peru', - 'pc' => 'Pitcairn Island', - 'xga' => 'Coral Sea Islands Territory', - 'ot' => 'Mayotte', - 'nz' => 'New Zealand', - 'wlk' => 'Wales', - 'nx' => 'Norfolk Island', - 'nw' => 'Northern Mariana Islands', - 'nu' => 'Nauru', - 'nr' => 'Nigeria', - 'nq' => 'Nicaragua', - 'np' => 'Nepal', - 'no' => 'Norway', - 
'nn' => 'Vanuatu', - 'nm' => 'Northern Mariana Islands', - 'nl' => 'New Caledonia', - 'ng' => 'Niger', - 'vra' => 'Victoria', - 'ne' => 'Netherlands', - 'na' => 'Netherlands Antilles', - 'mz' => 'Mozambique', - 'my' => 'Malaysia', - 'mx' => 'Mexico', - 'mw' => 'Malawi', - 'mv' => 'Moldova', - 'mu' => 'Mauritania', - 'mr' => 'Morocco', - 'mq' => 'Martinique', - 'mp' => 'Mongolia', - 'mo' => 'Montenegro', - 'mm' => 'Malta', - 'ml' => 'Mali', - 'mk' => 'Oman', - 'mj' => 'Montserrat', - 'mh' => 'Macao', - 'mg' => 'Madagascar', - 'mf' => 'Mauritius', - '|||' => 'No place, unknown, or undetermined', - 'mc' => 'Monaco', - 'ly' => 'Libya', - 'lv' => 'Latvia', - 'lu' => 'Luxembourg', - 'ls' => 'Laos', - 'lo' => 'Lesotho', - 'ln' => 'Central and Southern Line Islands', - 'li' => 'Lithuania', - 'lh' => 'Liechtenstein', - 'le' => 'Lebanon', - 'lb' => 'Liberia', - 'mvr' => 'Moldavian S.S.R.', - 'kz' => 'Kazakhstan', - 'ky' => 'uKentucky', - 'kv' => 'Kosovo', - 'ku' => 'Kuwait', - 'ko' => 'Korea (South)', - 'kn' => 'Korea (North)', - 'kg' => 'Kyrgyzstan', - 'ke' => 'Kenya', - 'jo' => 'Jordan', - 'jn' => 'Jan Mayen', - 'jm' => 'Jamaica', - 'ji' => 'Johnston Atoll', - 'ja' => 'Japan', - 'tar' => 'Tajik S.S.R.', - 'iy' => 'Iraq-Saudi Arabia Neutral Zone', - 'iw' => 'Israel-Jordan Demilitarized Zones', - 'iv' => "C\303\203\302\264te d'Ivoire", - 'iu' => 'Israel-Syria Demilitarized Zones', - 'it' => 'Italy', - 'is' => 'Israel', - 'ir' => 'Iran', - 'iq' => 'Iraq', - 'io' => 'Indonesia', - 'ii' => 'India', - 'kgr' => 'Kirghiz S.S.R.', - 'ie' => 'Ireland', - 'ic' => 'Iceland', - 'hu' => 'Hungary', - 'xxu' => 'United States', - 'ht' => 'Haiti', - 'xxr' => 'Soviet Union', - 'ho' => 'Honduras', - 'hm' => 'Heard and McDonald Islands', - 'hk' => 'Hong Kong', - 'xxk' => 'United Kingdom', - 'xxc' => 'Canada', - 'gz' => 'Gaza Strip', - 'gy' => 'Guyana', - 'gw' => 'Germany', - 'gv' => 'Guinea', - 'gu' => 'Guam', - 'gt' => 'Guatemala', - 'gs' => 'Georgia (Republic)', - 'gr' => 'Greece', - 'gp' => 
'Guadeloupe', - 'wea' => 'Western Australia', - 'go' => 'Gabon', - 'gn' => 'Gilbert and Ellice Islands', - 'gm' => 'Gambia', - 'gl' => 'Greenland', - 'err' => 'Estonia', - 'gi' => 'Gibraltar', - 'gh' => 'Ghana', - 'ge' => 'Germany (East)', - 'gd' => 'Grenada', - 'gb' => 'Kiribati', - 'ft' => 'Djibouti', - 'fs' => "Terres australes et antarctiques fran\303\203\302\247aises", - 'fr' => 'France', - 'fp' => 'French Polynesia', - 'fm' => 'Micronesia (Federated States)', - 'fk' => 'Falkland Islands', - 'fj' => 'Fiji', - 'fi' => 'Finland', - 'fg' => 'French Guiana', - 'fa' => 'Faroe Islands', - 'lvr' => 'Latvia', - 'et' => 'Ethiopia', - 'es' => 'El Salvador', - 'er' => 'Estonia', - 'em' => 'East Timor', - 'eg' => 'Equatorial Guinea', - 'ec' => 'Ecuador', - 'ea' => 'Eritrea', - 'nik' => 'Northern Ireland', - 'dr' => 'Dominican Republic', - 'dq' => 'Dominica', - 'unr' => 'Ukraine', - 'dm' => 'Benin', - 'dk' => 'Denmark', - '|' => 'No place, unknown, or undetermined', - 'cz' => 'Canal Zone', - 'cy' => 'Cyprus', - 'cx' => 'Central African Republic', - 'cw' => 'Cook Islands', - 'cv' => 'Cape Verde', - 'cu' => 'Cuba', - 'u' => 'United States', - 'cs' => 'Czechoslovakia', - 'cr' => 'Costa Rica', - 'r' => 'Soviet Socialist Republic', - 'cq' => 'Comoros', - 'cp' => 'Canton and Enderbury Islands', - 'cn' => 'Canada', - 'cm' => 'Cameroon', - 'kzr' => 'Kazakh S.S.R.', - 'cl' => 'Chile', - 'ck' => 'Colombia', - 'k' => 'United Kingdom', - 'cj' => 'Cayman Islands', - 'ci' => 'Croatia', - 'ch' => 'China (Republic : 1949- )', - 'cg' => 'Congo (Democratic Republic)', - 'cf' => 'Congo (Brazzaville)', - 'ce' => 'Sri Lanka', - 'cd' => 'Chad', - 'enk' => 'England', - 'cc' => 'China', - 'c' => 'Canada', - 'cb' => 'Cambodia', - 'a' => 'Australia', - 'bx' => 'Brunei', - 'bw' => 'Belarus', - 'bv' => 'Bouvet Island', - 'bu' => 'Bulgaria', - 'bt' => 'Bhutan', - 'bs' => 'Botswana', - 'br' => 'Burma', - 'bp' => 'Solomon Islands', - 'bo' => 'Bolivia', - 'bn' => 'Bosnia and Hercegovina', - 'bm' => 
'Bermuda Islands', - 'bl' => 'Brazil', - 'bi' => 'British Indian Ocean Territory', - 'bh' => 'Belize', - 'bg' => 'Bangladesh', - 'bf' => 'Bahamas', - 'be' => 'Belgium', - 'bd' => 'Burundi', - 'bb' => 'Barbados', - 'ba' => 'Bahrain', - 'xra' => 'South Australia', - 'ay' => 'Antarctica', - 'aw' => 'Aruba', - 'au' => 'Austria', - 'at' => 'Australia', - 'as' => 'American Samoa', - 'aq' => 'Antigua and Barbuda', - 'ao' => 'Angola', - 'an' => 'Andorra', - 'am' => 'Anguilla', - 'aj' => 'Azerbaijan', - 'za' => 'Zambia', - 'ai' => 'Armenia (Republic)', - 'ag' => 'Argentina', - 'af' => 'Afghanistan', - 'ae' => 'Algeria', - 'ac' => 'Ashmore and Cartier Islands', - 'aa' => 'Albania', - 'yu' => 'Serbia and Montenegro', - 'ys' => "Yemen (People's Democratic Republic)", - 'ye' => 'Yemen', - 'xx' => 'No place, unknown, or undetermined', - 'xv' => 'Slovenia', - 'xs' => 'South Georgia and the South Sandwich Islands' + "xr" => "Czech Republic", + "xp" => "Spratly Island", + "xo" => "Slovakia", + "xn" => "Macedonia", + "xm" => "Saint Vincent and the Grenadines", + "xl" => "Saint Pierre and Miquelon", + "xk" => "Saint Lucia", + "xj" => "Saint Helena", + "xi" => "Saint Kitts-Nevis-Anguilla", + "xh" => "Niue", + "xf" => "Midway Islands", + "xe" => "Marshall Islands", + "xd" => "Saint Kitts-Nevis", + "xc" => "Maldives", + "xb" => "Cocos (Keeling) Islands", + "uik" => "United Kingdom Misc. 
Islands", + "xa" => "Christmas Island (Indian Ocean)", + "ajr" => "Azerbaijan S.S.R.", + "xoa" => "Northern Territory", + "ws" => "Samoa", + "wk" => "Wake Island", + "wj" => "West Bank of the Jordan River", + "wf" => "Wallis and Futuna", + "wb" => "West Berlin", + "air" => "Armenian S.S.R.", + "xna" => "New South Wales", + "vs" => "Vietnam, South", + "stk" => "Scotland", + "vp" => "Various places", + "vn" => "Vietnam, North", + "vm" => "Vietnam", + "vi" => "Virgin Islands of the United States", + "ve" => "Venezuela", + "vc" => "Vatican City", + "vb" => "British Virgin Islands", + "uz" => "Uzbekistan", + "uy" => "Uruguay", + "uv" => "Burkina Faso", + "us" => "United States", + "ur" => "Soviet Union", + "up" => "United States Misc. Pacific Islands", + "tma" => "Tasmania", + "un" => "Ukraine", + "uk" => "United Kingdom", + "ui" => "United Kingdom Misc. Islands", + "ug" => "Uganda", + "uc" => "United States Misc. Caribbean Islands", + "ua" => "Egypt", + "gsr" => "Georgian S.S.R.", + "tz" => "Tanzania", + "tv" => "Tuvalu", + "tu" => "Turkey", + "tt" => "Trust Territory of the Pacific Islands", + "ts" => "United Arab Emirates", + "tr" => "Trinidad and Tobago", + "to" => "Tonga", + "tl" => "Tokelau", + "tk" => "Turkmenistan", + "ti" => "Tunisia", + "th" => "Thailand", + "tg" => "Togo", + "tc" => "Turks and Caicos Islands", + "ta" => "Tajikistan", + "tkr" => "Turkmen S.S.R.", + "sz" => "Switzerland", + "sy" => "Syria", + "sx" => "Namibia", + "sw" => "Sweden", + "sv" => "Swan Islands", + "su" => "Saudi Arabia", + "ss" => "Western Sahara", + "sr" => "Surinam", + "sq" => "Swaziland", + "sp" => "Spain", + "so" => "Somalia", + "sm" => "San Marino", + "qea" => "Queensland", + "sl" => "Sierra Leone", + "sk" => "Sikkim", + "sj" => "Sudan", + "si" => "Singapore", + "sh" => "Spanish North Africa", + "sg" => "Senegal", + "sf" => "Sao Tome and Principe", + "se" => "Seychelles", + "sb" => "Svalbard", + "sa" => "South Africa", + "ry" => "Ryukyu Islands, Southern", + "rw" => "Rwanda", + 
"ru" => "Russia (Federation)", + "rm" => "Romania", + "bwr" => "Byelorussian S.S.R.", + "rh" => "Zimbabwe", + "re" => "R\303\203\302\251union", + "rb" => "Serbia", + "rur" => "Russian S.F.S.R.", + "lir" => "Lithuania", + "qa" => "Qatar", + "py" => "Paraguay", + "pw" => "Palau", + "pt" => "Portuguese Timor", + "pr" => "Puerto Rico", + "pp" => "Papua New Guinea", + "uzr" => "Uzbek S.S.R.", + "po" => "Portugal", + "pn" => "Panama", + "pl" => "Poland", + "pk" => "Pakistan", + "ph" => "Philippines", + "aca" => "Australian Capital Territory", + "pg" => "Guinea-Bissau", + "pf" => "Paracel Islands", + "pe" => "Peru", + "pc" => "Pitcairn Island", + "xga" => "Coral Sea Islands Territory", + "ot" => "Mayotte", + "nz" => "New Zealand", + "wlk" => "Wales", + "nx" => "Norfolk Island", + "nw" => "Northern Mariana Islands", + "nu" => "Nauru", + "nr" => "Nigeria", + "nq" => "Nicaragua", + "np" => "Nepal", + "no" => "Norway", + "nn" => "Vanuatu", + "nm" => "Northern Mariana Islands", + "nl" => "New Caledonia", + "ng" => "Niger", + "vra" => "Victoria", + "ne" => "Netherlands", + "na" => "Netherlands Antilles", + "mz" => "Mozambique", + "my" => "Malaysia", + "mx" => "Mexico", + "mw" => "Malawi", + "mv" => "Moldova", + "mu" => "Mauritania", + "mr" => "Morocco", + "mq" => "Martinique", + "mp" => "Mongolia", + "mo" => "Montenegro", + "mm" => "Malta", + "ml" => "Mali", + "mk" => "Oman", + "mj" => "Montserrat", + "mh" => "Macao", + "mg" => "Madagascar", + "mf" => "Mauritius", + "|||" => "No place, unknown, or undetermined", + "mc" => "Monaco", + "ly" => "Libya", + "lv" => "Latvia", + "lu" => "Luxembourg", + "ls" => "Laos", + "lo" => "Lesotho", + "ln" => "Central and Southern Line Islands", + "li" => "Lithuania", + "lh" => "Liechtenstein", + "le" => "Lebanon", + "lb" => "Liberia", + "mvr" => "Moldavian S.S.R.", + "kz" => "Kazakhstan", + "ky" => "uKentucky", + "kv" => "Kosovo", + "ku" => "Kuwait", + "ko" => "Korea (South)", + "kn" => "Korea (North)", + "kg" => "Kyrgyzstan", + "ke" => 
"Kenya", + "jo" => "Jordan", + "jn" => "Jan Mayen", + "jm" => "Jamaica", + "ji" => "Johnston Atoll", + "ja" => "Japan", + "tar" => "Tajik S.S.R.", + "iy" => "Iraq-Saudi Arabia Neutral Zone", + "iw" => "Israel-Jordan Demilitarized Zones", + "iv" => "C\303\203\302\264te d'Ivoire", + "iu" => "Israel-Syria Demilitarized Zones", + "it" => "Italy", + "is" => "Israel", + "ir" => "Iran", + "iq" => "Iraq", + "io" => "Indonesia", + "ii" => "India", + "kgr" => "Kirghiz S.S.R.", + "ie" => "Ireland", + "ic" => "Iceland", + "hu" => "Hungary", + "xxu" => "United States", + "ht" => "Haiti", + "xxr" => "Soviet Union", + "ho" => "Honduras", + "hm" => "Heard and McDonald Islands", + "hk" => "Hong Kong", + "xxk" => "United Kingdom", + "xxc" => "Canada", + "gz" => "Gaza Strip", + "gy" => "Guyana", + "gw" => "Germany", + "gv" => "Guinea", + "gu" => "Guam", + "gt" => "Guatemala", + "gs" => "Georgia (Republic)", + "gr" => "Greece", + "gp" => "Guadeloupe", + "wea" => "Western Australia", + "go" => "Gabon", + "gn" => "Gilbert and Ellice Islands", + "gm" => "Gambia", + "gl" => "Greenland", + "err" => "Estonia", + "gi" => "Gibraltar", + "gh" => "Ghana", + "ge" => "Germany (East)", + "gd" => "Grenada", + "gb" => "Kiribati", + "ft" => "Djibouti", + "fs" => "Terres australes et antarctiques fran\303\203\302\247aises", + "fr" => "France", + "fp" => "French Polynesia", + "fm" => "Micronesia (Federated States)", + "fk" => "Falkland Islands", + "fj" => "Fiji", + "fi" => "Finland", + "fg" => "French Guiana", + "fa" => "Faroe Islands", + "lvr" => "Latvia", + "et" => "Ethiopia", + "es" => "El Salvador", + "er" => "Estonia", + "em" => "East Timor", + "eg" => "Equatorial Guinea", + "ec" => "Ecuador", + "ea" => "Eritrea", + "nik" => "Northern Ireland", + "dr" => "Dominican Republic", + "dq" => "Dominica", + "unr" => "Ukraine", + "dm" => "Benin", + "dk" => "Denmark", + "|" => "No place, unknown, or undetermined", + "cz" => "Canal Zone", + "cy" => "Cyprus", + "cx" => "Central African Republic", + "cw" => 
"Cook Islands", + "cv" => "Cape Verde", + "cu" => "Cuba", + "u" => "United States", + "cs" => "Czechoslovakia", + "cr" => "Costa Rica", + "r" => "Soviet Socialist Republic", + "cq" => "Comoros", + "cp" => "Canton and Enderbury Islands", + "cn" => "Canada", + "cm" => "Cameroon", + "kzr" => "Kazakh S.S.R.", + "cl" => "Chile", + "ck" => "Colombia", + "k" => "United Kingdom", + "cj" => "Cayman Islands", + "ci" => "Croatia", + "ch" => "China (Republic : 1949- )", + "cg" => "Congo (Democratic Republic)", + "cf" => "Congo (Brazzaville)", + "ce" => "Sri Lanka", + "cd" => "Chad", + "enk" => "England", + "cc" => "China", + "c" => "Canada", + "cb" => "Cambodia", + "a" => "Australia", + "bx" => "Brunei", + "bw" => "Belarus", + "bv" => "Bouvet Island", + "bu" => "Bulgaria", + "bt" => "Bhutan", + "bs" => "Botswana", + "br" => "Burma", + "bp" => "Solomon Islands", + "bo" => "Bolivia", + "bn" => "Bosnia and Hercegovina", + "bm" => "Bermuda Islands", + "bl" => "Brazil", + "bi" => "British Indian Ocean Territory", + "bh" => "Belize", + "bg" => "Bangladesh", + "bf" => "Bahamas", + "be" => "Belgium", + "bd" => "Burundi", + "bb" => "Barbados", + "ba" => "Bahrain", + "xra" => "South Australia", + "ay" => "Antarctica", + "aw" => "Aruba", + "au" => "Austria", + "at" => "Australia", + "as" => "American Samoa", + "aq" => "Antigua and Barbuda", + "ao" => "Angola", + "an" => "Andorra", + "am" => "Anguilla", + "aj" => "Azerbaijan", + "za" => "Zambia", + "ai" => "Armenia (Republic)", + "ag" => "Argentina", + "af" => "Afghanistan", + "ae" => "Algeria", + "ac" => "Ashmore and Cartier Islands", + "aa" => "Albania", + "yu" => "Serbia and Montenegro", + "ys" => "Yemen (People's Democratic Republic)", + "ye" => "Yemen", + "xx" => "No place, unknown, or undetermined", + "xv" => "Slovenia", + "xs" => "South Georgia and the South Sandwich Islands" } diff --git a/lib/translation_maps/ht/ht_namespace_map.rb b/lib/translation_maps/ht/ht_namespace_map.rb deleted file mode 100644 index 80adbd4..0000000 --- 
a/lib/translation_maps/ht/ht_namespace_map.rb +++ /dev/null @@ -1,34 +0,0 @@ -{ 'mdp' => 'University of Michigan', - 'miua' => 'University of Michigan', - 'miun' => 'University of Michigan', - 'wu' => 'University of Wisconsin', - 'inu' => 'Indiana University', - 'uc1' => 'University of California', - 'uc2' => 'University of California', - 'pst' => 'Penn State University', - 'umn' => 'University of Minnesota', - 'nnc1' => 'Columbia University', - 'nnc2' => 'Columbia University', - 'nyp' => 'New York Public Library', - 'uiuo' => 'University of Illinois', - 'njp' => 'Princeton University', - 'yale' => 'Yale University', - 'chi' => 'University of Chicago', - 'coo' => 'Cornell University', - 'ucm' => 'Universidad Complutense de Madrid', - 'loc' => 'Library of Congress', - 'ien' => 'Northwestern University', - 'hvd' => 'Harvard University', - 'uva' => 'University of Virginia', - 'dul1' => 'Duke University', - 'ncs1' => 'North Carolina State University', - 'nc01' => 'University of North Carolina', - 'pur1' => 'Purdue University', - 'pur2' => 'Purdue University', - 'mdl' => 'Minnesota Digital Library', - 'usu' => 'Utah State University Press', - 'gri' => 'Getty Research Institute', - 'uiug' => 'University of Illinois', - 'psia' => 'Penn State University', - 'bc' => 'Boston College', - 'ufl1' => 'University of Florida' } diff --git a/lib/translation_maps/umich/availability_map_umich.rb b/lib/translation_maps/umich/availability_map_umich.rb deleted file mode 100644 index 63b8db5..0000000 --- a/lib/translation_maps/umich/availability_map_umich.rb +++ /dev/null @@ -1,10 +0,0 @@ -{ - 'avail_ht_fulltext' => 'Full text available online via HathiTrust', - 'avail_circ' => 'Circulating volumes', - 'avail_checkout' => 'Available to check out', - 'avail_online' => 'Available online', - 'avail_recall' => 'Checked out (Available for recall)', - 'avail_ht' => 'HathiTrust', - 'avail_ebm_pod' => 'Available for Print on Demand' - -} diff --git a/lib/translation_maps/umich/institution_map.rb 
b/lib/translation_maps/umich/institution_map.rb deleted file mode 100644 index 920fc7a..0000000 --- a/lib/translation_maps/umich/institution_map.rb +++ /dev/null @@ -1,8 +0,0 @@ -{ - 'MiAaUTR' => ['UM Ann Arbor Libraries', 'Transportation Research Institute Library (UMTRI)'], - 'MIU' => ['UM Ann Arbor Libraries', 'University Library'], - 'MiU' => ['UM Ann Arbor Libraries', 'University Library'], - 'MiU-C' => ['UM Ann Arbor Libraries', 'University Library', 'William L. Clements Library'], - 'MiU-H' => ['UM Ann Arbor Libraries', 'University Library', 'Bentley Historical Library'], - 'MiFliC' => 'Flint Thompson Library' -} diff --git a/spec/cictl/delete_command_spec.rb b/spec/cictl/delete_command_spec.rb index 17979ed..ae8a281 100644 --- a/spec/cictl/delete_command_spec.rb +++ b/spec/cictl/delete_command_spec.rb @@ -3,21 +3,16 @@ require "spec_helper" RSpec.describe CICTL::DeleteCommand do - before(:each) do - CICTL::SolrClient.new.empty!.commit! - ENV["CICTL_ZEPHIR_FILE_TEMPLATE_PREFIX"] = "sample" - end - - after(:each) do - CICTL::SolrClient.new.empty!.commit! 
- ENV.delete "CICTL_ZEPHIR_FILE_TEMPLATE_PREFIX" - remove_test_log + around(:each) do |example| + with_test_environment do |tmpdir| + example.run + end end describe "#delete all" do it "deletes all records" do example = CICTL::Examples.for_date("20230103", type: :upd).first - file = File.join(HathiTrust::Services["data_directory"], example[:file]) + file = File.join(HathiTrust::Services[:data_directory], example[:file]) CICTL::Commands.start(["index", "file", file, "--log", test_log]) expect(solr_count).to be > 0 CICTL::Commands.start(["delete", "all", "--log", test_log]) @@ -28,29 +23,29 @@ describe "#delete file" do it "deletes 1 record" do upd_example = CICTL::Examples.for_date("20230102", type: :upd).first - file = File.join(HathiTrust::Services["data_directory"], upd_example[:file]) + file = File.join(HathiTrust::Services[:data_directory], upd_example[:file]) CICTL::Commands.start(["index", "file", file, "--log", test_log]) expect(solr_count).to eq upd_example[:ids].count expect(solr_deleted_count).to eq 0 delete_example = CICTL::Examples.for_date("20230102", type: :delete).first - file = File.join(HathiTrust::Services["data_directory"], delete_example[:file]) + file = File.join(HathiTrust::Services[:data_directory], delete_example[:file]) CICTL::Commands.start(["delete", "file", file, "--log", test_log]) expect(solr_count).to eq (upd_example[:ids] + delete_example[:ids]).uniq.count expect(solr_deleted_count).to eq delete_example[:ids].count end it "handles empty file" do - file = File.join(HathiTrust::Services["data_directory"], CICTL::Examples.empty_delete_file) + file = File.join(HathiTrust::Services[:data_directory], CICTL::Examples.empty_delete_file) CICTL::Commands.start(["delete", "file", file, "--log", test_log]) end it "handles file with spaces-only line" do - file = File.join(HathiTrust::Services["data_directory"], CICTL::Examples.blank_line_delete_file) + file = File.join(HathiTrust::Services[:data_directory], CICTL::Examples.blank_line_delete_file) 
CICTL::Commands.start(["delete", "file", file, "--log", test_log]) end it "errors on noisy file" do - file = File.join(HathiTrust::Services["data_directory"], CICTL::Examples.noisy_delete_file) + file = File.join(HathiTrust::Services[:data_directory], CICTL::Examples.noisy_delete_file) expect { CICTL::Commands.start(["delete", "file", file, "--log", test_log]) }.to raise_error(RSolr::Error::Http) end end diff --git a/spec/cictl/index_command_spec.rb b/spec/cictl/index_command_spec.rb index 11fb696..d14ac39 100644 --- a/spec/cictl/index_command_spec.rb +++ b/spec/cictl/index_command_spec.rb @@ -2,17 +2,52 @@ require "spec_helper" require "cictl/deleted_records" +require "cictl/journal" RSpec.describe CICTL::IndexCommand do - before(:each) do - CICTL::SolrClient.new.empty!.commit! - ENV["CICTL_ZEPHIR_FILE_TEMPLATE_PREFIX"] = "sample" + around(:each) do |example| + with_test_environment do |tmpdir| + example.run + end end - after(:each) do - CICTL::SolrClient.new.empty!.commit! - ENV.delete "CICTL_ZEPHIR_FILE_TEMPLATE_PREFIX" - remove_test_log + describe "#index continue" do + context "with no journal" do + it "indexes all example records" do + update_file_count = CICTL::Examples.of_type(:upd).count + CICTL::Commands.start(["index", "continue", "--quiet", "--log", test_log]) + expect(solr_count).to eq CICTL::Examples.all_ids.count + expect(Dir.children(HathiTrust::Services[:journal_directory]).count).to eq(update_file_count) + end + end + + context "with only the full file" do + it "indexes only the update files and writes a journal for each" do + CICTL::Examples.of_type(:full).each do |ex| + CICTL::Examples.journal_for(example: ex).write! 
+ end + update_file_count = CICTL::Examples.of_type(:upd).count + update_ids = CICTL::Examples.of_type(:upd, :delete).each_with_object([]) do |ex, ids| + ex[:ids].each { |id| ids << id } + end.uniq + old_journal_count = Dir.children(HathiTrust::Services[:journal_directory]).count + CICTL::Commands.start(["index", "continue", "--quiet", "--log", test_log]) + expect(solr_count).to eq update_ids.count + expect(Dir.children(HathiTrust::Services[:journal_directory]).count).to eq(old_journal_count + update_file_count) + end + end + + context "with a full journal" do + it "indexes nothing and writes no journals" do + CICTL::Examples.of_type(:full, :upd).each do |ex| + CICTL::Examples.journal_for(example: ex).write! + end + old_journal_count = Dir.children(HathiTrust::Services[:journal_directory]).count + CICTL::Commands.start(["index", "continue", "--quiet", "--log", test_log]) + expect(solr_count).to eq 0 + expect(Dir.children(HathiTrust::Services[:journal_directory]).count).to eq(old_journal_count) + end + end end describe "#index all" do @@ -24,6 +59,7 @@ expect(solr_count).to eq CICTL::Examples.all_ids.count + 1 expect(solr_deleted_count).to be > 0 expect(solr_ids("deleted:true")).to include(bogus_delete) + expect(Dir.children(HathiTrust::Services[:journal_directory]).count).to be > 0 end context "using nonexistent redirect file" do @@ -65,6 +101,7 @@ examples = CICTL::Examples.for_date("20230103") CICTL::Commands.start(["index", "date", "20230103", "--log", test_log]) expect(solr_count).to eq examples.map { |ex| ex[:ids] }.flatten.uniq.count + expect(File.exist?(CICTL::Journal.new(date: Date.new(2023, 1, 3)).path)).to eq(true) end it "raises on bogus date" do @@ -138,7 +175,7 @@ after(:each) { HathiTrust::Services.register(:data_directory) { @save_dd } } # Note that "today" means "index today, using the file dated yesterday" - it "indexes 'today' and produces deletes file" do + it "indexes 'today' and produces deletes file and journal file" do update_source = 
CICTL::ZephirFile.update_files.last del_source = CICTL::ZephirFile.delete_files.last @@ -161,6 +198,7 @@ expect(solr_count).to eq(zcount + delcount) expect(CICTL::DeletedRecords.daily_file.readable?) expect(Zinzout.zin(CICTL::DeletedRecords.daily_file).count).to eq(delcount) + expect(File.exist?(CICTL::Journal.new(date: Date.today - 1).path)).to eq(true) end end end diff --git a/spec/cictl/journal_spec.rb b/spec/cictl/journal_spec.rb new file mode 100644 index 0000000..bceb472 --- /dev/null +++ b/spec/cictl/journal_spec.rb @@ -0,0 +1,46 @@ +# frozen_string_literal: true + +require "spec_helper" + +RSpec.describe CICTL::Journal do + around(:each) do |example| + with_test_environment do |tmpdir| + example.run + end + end + + describe "#initialize" do + it "creates a CICTL::Journal object" do + journal = described_class.new + expect(journal).to be_kind_of(CICTL::Journal) + end + end + + describe "#file" do + it "returns a filename matching the class regular expression" do + journal = described_class.new + expect(journal.file).to match(CICTL::Journal::FILENAME_PATTERN) + end + end + + describe "#write" do + context "with default date" do + it "creates the file" do + journal = described_class.new + journal.write! + expect(File.exist?(journal.path)).to eq true + expect(journal.path).to match((Date.today - 1).strftime("%Y%m%d")) + end + end + + context "with another date" do + it "creates the file" do + date = Date.new(2020, 6, 1) + journal = described_class.new(date: date) + journal.write! 
+ expect(File.exist?(journal.path)).to eq true + expect(journal.path).to match(date.strftime("%Y%m%d")) + end + end + end +end diff --git a/spec/cictl/logger_factory_spec.rb b/spec/cictl/logger_factory_spec.rb index 96d6503..047bd74 100644 --- a/spec/cictl/logger_factory_spec.rb +++ b/spec/cictl/logger_factory_spec.rb @@ -4,14 +4,18 @@ require_relative "../../lib/cictl/logfile_defaults" require_relative "../../lib/services" -ENV["CICTL_SEMANTIC_LOGGER_SYNC"] = "1" RSpec.describe CICTL::LoggerFactory do - def testlogger(verbose: false, log_file: test_log, quiet: false) - CICTL::LoggerFactory.new(verbose: verbose, log_file: log_file, quiet: quiet).logger + around(:each) do |example| + ClimateControl.modify(CICTL_SEMANTIC_LOGGER_SYNC: "1") do + with_test_environment do |tmpdir| + @test_log_path = File.join(HathiTrust::Services[:logfile_directory], test_log) + example.run + end + end end - after(:each) do - remove_test_log + def testlogger(verbose: false, log_file: test_log, quiet: false) + CICTL::LoggerFactory.new(verbose: verbose, log_file: log_file, quiet: quiet).logger end it "sends #error to $stderr" do @@ -28,7 +32,7 @@ def testlogger(verbose: false, log_file: test_log, quiet: false) it "sends stuff to the logfile" do testlogger.error "error-in-file" - expect(File.read(HathiTrust::Services[:logfile_directory] + "/" + test_log)).to match(/error-in-file/) + expect(File.read(@test_log_path)).to match(/error-in-file/) end it "does not send anything less than #error to STDERR" do @@ -44,4 +48,14 @@ def testlogger(verbose: false, log_file: test_log, quiet: false) testlogger(quiet: true).error("Error") }.not_to output(/Error/).to_stderr_from_any_process end + + it "maps --log=daily into today's date" do + testlogger(log_file: "daily").info "info-in-file" + expect(Dir.children(HathiTrust::Services[:logfile_directory]).first).to match(/daily_\d{8}\.log/) + end + + it "maps --log=full into today's date" do + testlogger(log_file: "full").info "info-in-file" + 
expect(Dir.children(HathiTrust::Services[:logfile_directory]).first).to match(/full_\d{8}\.log/) + end end diff --git a/spec/examples.rb b/spec/examples.rb index ecbd32d..cf165a8 100644 --- a/spec/examples.rb +++ b/spec/examples.rb @@ -1,5 +1,7 @@ # frozen_string_literal: true +require "cictl/journal" + # These are the examples used as fixtures in the cictl specs. # There is a single monthly file with 16 items. # There is an update file with the same date as the monthly, having 32 items. @@ -64,6 +66,10 @@ def for_date(date, type: nil) end end + def of_type(*types) + EXAMPLES.select { |ex| types.include? ex[:type] } + end + # delete file with single newline def empty_delete_file "sample_empty_delete.txt.gz" @@ -84,7 +90,13 @@ def redirects_file "redirects/sample_redirects.txt.gz" end - module_function :all_ids, :for_date, :empty_delete_file, :noisy_delete_file, - :blank_line_delete_file, :redirects_file + def journal_for(example:) + if [:full, :upd].include? example[:type] + CICTL::Journal.new(date: Date.parse(example[:date]), full: example[:type] == :full) + end + end + + module_function :all_ids, :for_date, :of_type, :empty_delete_file, :noisy_delete_file, + :blank_line_delete_file, :redirects_file, :journal_for end end diff --git a/spec/spec_helper.rb b/spec/spec_helper.rb index 8c84d06..4d10ace 100644 --- a/spec/spec_helper.rb +++ b/spec/spec_helper.rb @@ -1,7 +1,9 @@ ENV["JRUBY_OPTS"] = "--debug #{ENV["JRUBY_OPTS"]}" +require "climate_control" require "simplecov" require "simplecov-lcov" +require "tmpdir" SimpleCov::Formatter::LcovFormatter.config do |c| c.report_with_single_file = true @@ -19,6 +21,55 @@ require_relative "../lib/ht_traject" require_relative "examples" +def with_test_environment + CICTL::SolrClient.new.empty!.commit! 
+ Dir.mktmpdir do |tmpdir| + ClimateControl.modify(CICTL_ZEPHIR_FILE_TEMPLATE_PREFIX: "sample") do + old_logfile_directory = HathiTrust::Services[:logfile_directory] + old_journal_directory = HathiTrust::Services[:journal_directory] + new_logfile_directory = File.join(tmpdir, "logs") + new_journal_directory = File.join(tmpdir, "journal") + FileUtils.mkdir(new_logfile_directory) unless File.exist?(new_logfile_directory) + FileUtils.mkdir(new_journal_directory) unless File.exist?(new_journal_directory) + HathiTrust::Services.register(:logfile_directory) { new_logfile_directory } + HathiTrust::Services.register(:journal_directory) { new_journal_directory } + yield tmpdir + HathiTrust::Services.register(:logfile_directory) { old_logfile_directory } + HathiTrust::Services.register(:journal_directory) { old_journal_directory } + end + end +end + +# Typically created in a temp directory +def test_log + "TEST_LOG.txt" +end + +def solr_count + CICTL::SolrClient.new.count +end + +def solr_deleted_count + CICTL::SolrClient.new.count_deleted +end + +def solr_ids(q = "*:*") + solr_params = {q: q, wt: "ruby", rows: 100} + response = CICTL::SolrClient.new.get("select", params: solr_params) + response["response"]["docs"].map { |doc| doc["id"] } +end + +def override_service(key, &block) + around(:each) do |example| + old_val = HathiTrust::Services[key] + HathiTrust::Services.register(key, &block) + example.run + HathiTrust::Services.register(key) { old_val } + end +end + +# BEGIN RSPEC BOILERPLATE + # This file was generated by the `rspec --init` command. Conventionally, all # specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`. # The generated `.rspec` file contains `--require spec_helper` which will cause @@ -115,36 +166,3 @@ # # as the one that triggered the failure. 
# Kernel.srand config.seed end - -def test_log - "TEST_LOG.txt" -end - -def remove_test_log - FileUtils.rm(CICTL::LogfileDefaults.filepath_of(test_log)) -rescue Errno::ENOENT - # file wasn't there, and that's fine -end - -def solr_count - CICTL::SolrClient.new.count -end - -def solr_deleted_count - CICTL::SolrClient.new.count_deleted -end - -def solr_ids(q = "*:*") - solr_params = {q: q, wt: "ruby", rows: 100} - response = CICTL::SolrClient.new.get("select", params: solr_params) - response["response"]["docs"].map { |doc| doc["id"] } -end - -def override_service(key, &block) - around(:each) do |example| - old_val = HathiTrust::Services[key] - HathiTrust::Services.register(key, &block) - example.run - HathiTrust::Services.register(key) { old_val } - end -end diff --git a/spec/translation_maps/availability_map_ht_intl_spec.rb b/spec/translation_maps/availability_map_ht_intl_spec.rb new file mode 100644 index 0000000..7272bf0 --- /dev/null +++ b/spec/translation_maps/availability_map_ht_intl_spec.rb @@ -0,0 +1,54 @@ +# frozen_string_literal: true + +require "spec_helper" +require "ht_traject/ht_constants" + +module AvailabilitynMapHTINTL + TESTS = { + "pd" => [HathiTrust::Constants::FT], # 1 + "ic" => [HathiTrust::Constants::SO], # 2 + "op" => [HathiTrust::Constants::SO], # 3 + "orph" => [HathiTrust::Constants::SO], # 4 + "und" => [HathiTrust::Constants::SO], # 5 + "umall" => [HathiTrust::Constants::SO], # 6 + "ic-world" => [HathiTrust::Constants::FT], # 7 + "nobody" => [HathiTrust::Constants::SO], # 8 + "pdus" => [HathiTrust::Constants::SO], # 9 + "cc-by-3.0" => [HathiTrust::Constants::FT], # 10 + "cc-by-nd-3.0" => [HathiTrust::Constants::FT], # 11 + "cc-by-nc-nd-3.0" => [HathiTrust::Constants::FT], # 12 + "cc-by-nc-3.0" => [HathiTrust::Constants::FT], # 13 + "cc-by-nc-sa-3.0" => [HathiTrust::Constants::FT], # 14 + "cc-by-sa-3.0" => [HathiTrust::Constants::FT], # 15 + "orphcand" => [HathiTrust::Constants::SO], # 16 + "cc-zero" => [HathiTrust::Constants::FT], # 17 + 
"und-world" => [HathiTrust::Constants::FT], # 18 + "icus" => [HathiTrust::Constants::FT], # 19 + "cc-by-4.0" => [HathiTrust::Constants::FT], # 20 + "cc-by-nd-4.0" => [HathiTrust::Constants::FT], # 21 + "cc-by-nc-nd-4.0" => [HathiTrust::Constants::FT], # 22 + "cc-by-nc-4.0" => [HathiTrust::Constants::FT], # 23 + "cc-by-nc-sa-4.0" => [HathiTrust::Constants::FT], # 24 + "cc-by-sa-4.0" => [HathiTrust::Constants::FT], # 25 + "pd-pvt" => [HathiTrust::Constants::SO], # 26 + "supp" => [HathiTrust::Constants::SO] # 27 + }.freeze + + RSpec.describe AvailabilitynMapHTINTL do + let(:map) { + rb_file = File.expand_path("../../lib/translation_maps/ht/availability_map_ht_intl.rb", __dir__) + # Traject uses `eval` on these files, and alas so must we (apparently). + eval(File.read(rb_file), binding, rb_file) # standard:disable Security/Eval + } + + it "is non-nil" do + expect(map).not_to eq nil + end + + TESTS.each do |input, expected| + it "maps #{input} to #{expected}" do + expect(map[input]).to eq expected + end + end + end +end diff --git a/spec/translation_maps/availability_map_ht_spec.rb b/spec/translation_maps/availability_map_ht_spec.rb new file mode 100644 index 0000000..91dddf4 --- /dev/null +++ b/spec/translation_maps/availability_map_ht_spec.rb @@ -0,0 +1,54 @@ +# frozen_string_literal: true + +require "spec_helper" +require "ht_traject/ht_constants" + +module AvailabilitynMapHT + TESTS = { + "pd" => [HathiTrust::Constants::FT], # 1 + "ic" => [HathiTrust::Constants::SO], # 2 + "op" => [HathiTrust::Constants::SO], # 3 + "orph" => [HathiTrust::Constants::SO], # 4 + "und" => [HathiTrust::Constants::SO], # 5 + "umall" => [HathiTrust::Constants::SO], # 6 + "ic-world" => [HathiTrust::Constants::FT], # 7 + "nobody" => [HathiTrust::Constants::SO], # 8 + "pdus" => [HathiTrust::Constants::FT], # 9 + "cc-by-3.0" => [HathiTrust::Constants::FT], # 10 + "cc-by-nd-3.0" => [HathiTrust::Constants::FT], # 11 + "cc-by-nc-nd-3.0" => [HathiTrust::Constants::FT], # 12 + "cc-by-nc-3.0" => 
[HathiTrust::Constants::FT], # 13 + "cc-by-nc-sa-3.0" => [HathiTrust::Constants::FT], # 14 + "cc-by-sa-3.0" => [HathiTrust::Constants::FT], # 15 + "orphcand" => [HathiTrust::Constants::SO], # 16 + "cc-zero" => [HathiTrust::Constants::FT], # 17 + "und-world" => [HathiTrust::Constants::FT], # 18 + "icus" => [HathiTrust::Constants::SO], # 19 + "cc-by-4.0" => [HathiTrust::Constants::FT], # 20 + "cc-by-nd-4.0" => [HathiTrust::Constants::FT], # 21 + "cc-by-nc-nd-4.0" => [HathiTrust::Constants::FT], # 22 + "cc-by-nc-4.0" => [HathiTrust::Constants::FT], # 23 + "cc-by-nc-sa-4.0" => [HathiTrust::Constants::FT], # 24 + "cc-by-sa-4.0" => [HathiTrust::Constants::FT], # 25 + "pd-pvt" => [HathiTrust::Constants::SO], # 26 + "supp" => [HathiTrust::Constants::SO] # 27 + }.freeze + + RSpec.describe AvailabilitynMapHT do + let(:map) { + rb_file = File.expand_path("../../lib/translation_maps/ht/availability_map_ht.rb", __dir__) + # Traject uses `eval` on these files, and alas so must we (apparently). + eval(File.read(rb_file), binding, rb_file) # standard:disable Security/Eval + } + + it "is non-nil" do + expect(map).not_to eq nil + end + + TESTS.each do |input, expected| + it "maps #{input} to #{expected}" do + expect(map[input]).to eq expected + end + end + end +end