diff --git a/Gemfile b/Gemfile index 1e4b009d..2aa7226a 100644 --- a/Gemfile +++ b/Gemfile @@ -18,17 +18,18 @@ gem 'oai' gem 'ougai', '~> 1.8' gem 'parse-cron' gem 'puma' -gem 'rails', '~> 6.0.3' +gem 'rails', '~> 6.0.3.5' gem 'responders' gem 'sidekiq' gem 'sinatra', require: nil -gem 'supplejack_common', github: 'DigitalNZ/supplejack_common', tag: 'v2.9.0' +gem 'supplejack_common', github: 'DigitalNZ/supplejack_common', tag: 'v2.10.0' gem 'whenever', require: false gem 'brakeman' gem 'moderate_parameters' +gem 'amazing_print' group :test do - gem 'database_cleaner' + gem 'database_cleaner-mongoid' gem 'factory_bot_rails' gem 'rails-controller-testing' gem 'rspec-activemodel-mocks' diff --git a/Gemfile.lock b/Gemfile.lock index d1bd780d..5825b1eb 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -1,10 +1,10 @@ GIT remote: https://github.com/DigitalNZ/supplejack_common - revision: c2085f9c21bcd1ed5d823ee850b0b6e5d88ee590 - tag: v2.9.0 + revision: 570099ff3b2fb94ba1436eac41cfe1a83880b6f0 + tag: v2.10.0 specs: supplejack_common (0.0.2) - actionpack + actionpack (~> 6.0.3.5) activesupport aws-sdk-s3 chronic (<= 0.10.2) @@ -100,25 +100,26 @@ GEM airbrake-ruby (~> 5.1) airbrake-ruby (5.2.0) rbtree3 (~> 0.5) - ast (2.4.1) + amazing_print (1.2.2) + ast (2.4.2) aws-eventstream (1.1.0) - aws-partitions (1.405.0) - aws-sdk-core (3.110.0) + aws-partitions (1.426.0) + aws-sdk-core (3.112.0) aws-eventstream (~> 1, >= 1.0.2) aws-partitions (~> 1, >= 1.239.0) aws-sigv4 (~> 1.1) jmespath (~> 1.0) - aws-sdk-kms (1.39.0) - aws-sdk-core (~> 3, >= 3.109.0) + aws-sdk-kms (1.42.0) + aws-sdk-core (~> 3, >= 3.112.0) aws-sigv4 (~> 1.1) - aws-sdk-s3 (1.86.1) - aws-sdk-core (~> 3, >= 3.109.0) - aws-sdk-kms (~> 1.26) + aws-sdk-s3 (1.88.0) + aws-sdk-core (~> 3, >= 3.112.0) + aws-sdk-kms (~> 1) aws-sigv4 (~> 1.1) aws-sigv4 (1.2.2) aws-eventstream (~> 1, >= 1.0.2) - brakeman (4.10.0) - bson (4.11.1) + brakeman (5.0.0) + bson (4.12.0) builder (3.2.4) byebug (11.1.3) case_transform (0.2) @@ -129,6 +130,13 @@ GEM connection_pool (2.2.3) crass (1.0.6) database_cleaner (1.8.5) + database_cleaner-mongo (1.8.1) + database_cleaner (~> 1.8.0) + mongo + database_cleaner-mongoid (1.8.1) + database_cleaner (~> 1.8.0) + database_cleaner-mongo (~> 1.8.0) + mongoid diff-lcs (1.4.4) dimensions (1.3.0) domain_name (0.5.20190701) @@ -139,12 +147,14 @@ GEM factory_bot_rails (6.1.0) factory_bot (~> 6.1.0) railties (>= 5.0.0) - faraday (1.1.0) + faraday (1.3.0) + faraday-net_http (~> 1.0) multipart-post (>= 1.2, < 3) ruby2_keywords + faraday-net_http (1.0.1) faraday_middleware (1.0.0) faraday (~> 1.0) - ffi (1.13.1) + ffi (1.14.2) figaro (1.2.0) thor (>= 0.14.0, < 2) globalid (0.4.2) @@ -194,7 +204,7 @@ GEM method_source (1.0.0) mime-types (3.3.1) mime-types-data (~> 3.2015) - mime-types-data (3.2020.1104) + mime-types-data (3.2021.0225) mimemagic (0.3.5) mini_mime (1.0.2) mini_portile2 (2.5.0) @@ -224,12 +234,12 @@ GEM builder (>= 3.1.0) faraday faraday_middleware - oj (3.10.16) - ougai (1.8.5) + oj (3.11.2) + ougai (1.9.1) oj (~> 3.10) parallel (1.20.1) parse-cron (0.1.4) - parser (2.7.2.0) + parser (3.0.0.0) ast (~> 2.4.1) pry (0.13.1) coderay (~> 1.1) @@ -239,7 +249,7 @@ GEM pry (~> 0.13.0) pry-rails (0.3.9) pry (>= 0.10.4) - puma (5.1.1) + puma (5.2.1) nio4r (~> 2.0) racc (1.5.2) rack (2.2.3) @@ -284,7 +294,7 @@ GEM ffi (~> 1.0) rbtree3 (0.6.0) redis (4.2.5) - regexp_parser (2.0.0) + regexp_parser (2.0.3) request_store (1.5.0) rack (>= 1.4) responders (3.0.1) @@ -301,64 +311,64 @@ GEM activemodel (>= 3.0) activesupport (>= 3.0) rspec-mocks (>= 2.99, < 4.0) - rspec-core (3.10.0) + rspec-core (3.10.1) rspec-support (~> 3.10.0) - rspec-expectations (3.10.0) + rspec-expectations (3.10.1) diff-lcs (>= 1.2.0, < 2.0) rspec-support (~> 3.10.0) - rspec-mocks (3.10.0) + rspec-mocks (3.10.2) diff-lcs (>= 1.2.0, < 2.0) rspec-support (~> 3.10.0) - rspec-rails (4.0.1) + rspec-rails (4.0.2) actionpack (>= 4.2) activesupport (>= 4.2) railties (>= 4.2) - rspec-core (~> 3.9) - rspec-expectations (~> 3.9) - rspec-mocks (~> 3.9) - rspec-support (~> 3.9) + rspec-core (~> 3.10) + rspec-expectations (~> 3.10) + rspec-mocks (~> 3.10) + rspec-support (~> 3.10) rspec-sidekiq (3.1.0) rspec-core (~> 3.0, >= 3.0.0) sidekiq (>= 2.4.0) - rspec-support (3.10.0) - rubocop (1.6.1) + rspec-support (3.10.2) + rubocop (1.9.1) parallel (~> 1.10) - parser (>= 2.7.1.5) + parser (>= 3.0.0.0) rainbow (>= 2.2.2, < 4.0) regexp_parser (>= 1.8, < 3.0) rexml rubocop-ast (>= 1.2.0, < 2.0) ruby-progressbar (~> 1.7) - unicode-display_width (>= 1.4.0, < 2.0) - rubocop-ast (1.3.0) + unicode-display_width (>= 1.4.0, < 3.0) + rubocop-ast (1.4.1) parser (>= 2.7.1.5) rubocop-packaging (0.5.1) rubocop (>= 0.89, < 2.0) - rubocop-performance (1.9.1) + rubocop-performance (1.9.2) rubocop (>= 0.90.0, < 2.0) rubocop-ast (>= 0.4.0) - rubocop-rails (2.9.0) + rubocop-rails (2.9.1) activesupport (>= 4.2.0) rack (>= 1.1) rubocop (>= 0.90.0, < 2.0) - rubocop-rails_config (1.1.1) + rubocop-rails_config (1.2.2) railties (>= 5.0) rubocop (>= 1.0) rubocop-ast (>= 1.0.1) - rubocop-packaging (~> 0.4) + rubocop-packaging (~> 0.5) rubocop-performance (~> 1.3) rubocop-rails (~> 2.0) - rubocop-rspec (2.0.1) + rubocop-rspec (2.2.0) rubocop (~> 1.0) rubocop-ast (>= 1.1.0) - ruby-progressbar (1.10.1) - ruby2_keywords (0.0.2) + ruby-progressbar (1.11.0) + ruby2_keywords (0.0.4) ruby_dep (1.5.0) sanitize (5.2.3) crass (~> 1.0.2) nokogiri (>= 1.8.0) nokogumbo (~> 2.0) - sidekiq (6.1.2) + sidekiq (6.1.3) connection_pool (>= 2.2.2) rack (~> 2.0) redis (>= 4.2.0) @@ -381,13 +391,13 @@ GEM thor (1.1.0) thread_safe (0.3.6) tilt (2.0.10) - timecop (0.9.2) + timecop (0.9.3) tzinfo (1.2.9) thread_safe (~> 0.1) unf (0.1.4) unf_ext unf_ext (0.0.7.7) - unicode-display_width (1.7.0) + unicode-display_width (2.0.0) websocket-driver (0.7.3) websocket-extensions (>= 0.1.0) websocket-extensions (0.1.5) @@ -403,10 +413,11 @@ DEPENDENCIES active_model_serializers (~> 0.10.7) activeresource airbrake + amazing_print aws-sdk-s3 (~> 1) brakeman chronic - database_cleaner + database_cleaner-mongoid factory_bot_rails figaro kaminari @@ -421,7 +432,7 @@ DEPENDENCIES pry-byebug pry-rails puma - rails (~> 6.0.3) + rails (~> 6.0.3.5) rails-controller-testing responders rspec-activemodel-mocks diff --git a/app/models/abstract_job.rb b/app/models/abstract_job.rb index f755a9eb..9cbe6e08 100644 --- a/app/models/abstract_job.rb +++ b/app/models/abstract_job.rb @@ -9,6 +9,8 @@ class AbstractJob index status: 1, start_time: 1, created_at: 1, updated_at: 1, parser_id: 1 + after_save :check_if_job_should_be_resumed, if: :status_changed? + field :start_time, type: DateTime field :end_time, type: DateTime field :updated_at, type: DateTime @@ -32,6 +34,7 @@ class AbstractJob embeds_many :invalid_records embeds_many :failed_records embeds_one :harvest_failure + embeds_many :states belongs_to :harvest_schedule, optional: true @@ -98,6 +101,7 @@ def required_enrichments state :active state :finished state :failed + state :resume state :stopped event :start do @@ -122,6 +126,16 @@ def required_enrichments transitions to: :finished end + event :resume do + after do + self.records_count = self.posted_records_count + save! + self.enqueue + end + + transitions to: :active + end + event :error do after do self.start_time = Time.zone.now if start_time.blank? @@ -203,4 +217,8 @@ def increment_processed_count! self.processed_count += 1 save! end + + def check_if_job_should_be_resumed + self.resume! if self.status_change.last == 'resume' + end end diff --git a/app/models/harvest_job.rb b/app/models/harvest_job.rb index acc73d22..b346a363 100644 --- a/app/models/harvest_job.rb +++ b/app/models/harvest_job.rb @@ -53,6 +53,16 @@ def records options[:limit] = limit.to_i if limit.to_i.positive? options[:from] = parser.last_harvested_at if incremental? && parser.last_harvested_at + # pass details that are needed for resuming the job ... + options[:job] = self + + if self.states.any? + options[:page] = self.states.last.page + options[:limit] = self.states.last.limit + options[:counter] = self.states.last.counter + options[:base_urls] = self.states.last.base_urls + end + parser.load_file(environment) parser_klass = parser.loader.parser_class parser_klass.environment = environment if environment.present? diff --git a/app/models/state.rb b/app/models/state.rb new file mode 100644 index 00000000..aa9f609f --- /dev/null +++ b/app/models/state.rb @@ -0,0 +1,15 @@ +# frozen_string_literal: true + +# app/models/state.rb +class State + include Mongoid::Document + include Mongoid::Timestamps + + embedded_in :abstract_job + + field :page, type: Integer + field :per_page, type: Integer + field :limit, type: Integer + field :counter, type: Integer + field :base_urls, type: Array +end diff --git a/app/models/supplejack_api/record.rb b/app/models/supplejack_api/record.rb index f45ef245..16e3c00e 100644 --- a/app/models/supplejack_api/record.rb +++ b/app/models/supplejack_api/record.rb @@ -8,8 +8,8 @@ class Record < ActiveResource::Base include Enrichable def self.find(query, page) - Rails.logger.info "query: #{query}" - Rails.logger.info "page: #{page}" + Rails.logger.debug "query: #{query}" + Rails.logger.debug "page: #{page}" super(:all, params: { search: query, search_options: page, api_key: ENV['HARVESTER_API_KEY'] }) rescue ActiveResource::ServerError => e Airbrake.notify(e, error_message: "The api request failed with: query: #{query} and page: #{page}. #{e&.message}") diff --git a/app/workers/abstract_worker.rb b/app/workers/abstract_worker.rb index 8ad0da3e..b31f9a29 100644 --- a/app/workers/abstract_worker.rb +++ b/app/workers/abstract_worker.rb @@ -26,6 +26,7 @@ def sanitize_id(id) def api_update_finished? job.reload + job.posted_records_count == job.records_count end diff --git a/app/workers/enrichment_worker.rb b/app/workers/enrichment_worker.rb index 3d5a1643..e8ab1358 100644 --- a/app/workers/enrichment_worker.rb +++ b/app/workers/enrichment_worker.rb @@ -25,7 +25,12 @@ def perform(enrichment_job_id) enrichment_class.before(job.enrichment) - records = fetch_records(1) + if job.states.any? + records = fetch_records(job.states.last.page.to_i) + else + records = fetch_records(1) + job.states.create!(page: 1) + end while more_records?(records) records.each do |record| @@ -37,6 +42,7 @@ def perform(enrichment_job_id) break if last_page_records?(records) records = fetch_records(records.pagination['page'] + 1) + job.states.create!(page: records.pagination['page']) unless stop_harvest? end until api_update_finished? @@ -46,7 +52,7 @@ def perform(enrichment_job_id) enrichment_class.after(job.enrichment) - job.finish! + job.finish! unless job.stopped? end def more_records?(records) diff --git a/app/workers/harvest_worker.rb b/app/workers/harvest_worker.rb index 9bc3a72b..0fff1731 100644 --- a/app/workers/harvest_worker.rb +++ b/app/workers/harvest_worker.rb @@ -6,6 +6,7 @@ class HarvestWorker < AbstractWorker include Sidekiq::Worker include Matcher::ConceptMatcher + sidekiq_options retry: 5, backtrace: true, queue: 'default' sidekiq_retries_exhausted do |msg| diff --git a/config/environment.rb b/config/environment.rb index d5abe558..426333bb 100644 --- a/config/environment.rb +++ b/config/environment.rb @@ -1,5 +1,3 @@ -# frozen_string_literal: true - # Load the Rails application. require_relative 'application' diff --git a/spec/factories/harvest_jobs.rb b/spec/factories/harvest_jobs.rb index cebbed39..0c5a1f6c 100644 --- a/spec/factories/harvest_jobs.rb +++ b/spec/factories/harvest_jobs.rb @@ -11,5 +11,12 @@ sequence(:user_id) { |n| "abc#{n}" } association :harvest_schedule, factory: :harvest_schedule + + trait :stateful do + after(:create) do |harvest_job| + harvest_job.states << build_list(:state, 3) + harvest_job.save! + end + end end end diff --git a/spec/factories/link_check_rules.rb b/spec/factories/link_check_rules.rb index 93333d21..9ac8bce7 100644 --- a/spec/factories/link_check_rules.rb +++ b/spec/factories/link_check_rules.rb @@ -5,7 +5,7 @@ FactoryBot.define do factory :link_check_rule do sequence(:source_id) { |n| "abc#{n}" } - xpath { '/xpath' } - status_codes { '404' } + xpath { '/xpath' } + status_codes { '404' } end end diff --git a/spec/factories/state.rb b/spec/factories/state.rb new file mode 100644 index 00000000..94c63659 --- /dev/null +++ b/spec/factories/state.rb @@ -0,0 +1,11 @@ +# frozen_string_literal: true + +FactoryBot.define do + factory :state do + page { 1 } + per_page { 10 } + limit { 100 } + counter { 1 } + base_urls { [] } + end +end diff --git a/spec/models/abstract_job_spec.rb b/spec/models/abstract_job_spec.rb index 3b7a01fd..684cd912 100644 --- a/spec/models/abstract_job_spec.rb +++ b/spec/models/abstract_job_spec.rb @@ -428,4 +428,16 @@ expect(job.processed_count).to eq 1 end end + + describe 'check_if_job_should_be_resumed' do + it 'tells a job to resume if it\'s status is set to resume' do + expect(job).to receive(:resume!) + job.update_attributes(status: 'resume') + end + + it 'does not tell a job to resume if it\'s status is not set to resume' do + expect(job).not_to receive(:resume!) + job.update_attributes(status: 'stopped') + end + end end diff --git a/spec/models/harvest_job_spec.rb b/spec/models/harvest_job_spec.rb index 8823d297..8eb36f0a 100644 --- a/spec/models/harvest_job_spec.rb +++ b/spec/models/harvest_job_spec.rb @@ -154,6 +154,30 @@ end end + describe 'resume!' do + let(:resumable_job) { create(:harvest_job, parser_id: '12345', version_id: '666', posted_records_count: 10, records_count: 12) } + + it 'sets the records_count to be the same as the posted_records_count' do + resumable_job.resume! + expect(resumable_job.records_count).to eq resumable_job.posted_records_count + end + + it 'sets the job status to be active' do + resumable_job.resume! + expect(resumable_job.active?).to eq true + end + + it 'saves the job' do + expect(resumable_job).to receive(:save) + resumable_job.resume! + end + + it 'enqueues the job' do + expect(resumable_job).to receive(:enqueue) + resumable_job.resume! + end + end + describe '#full_and_flush_available?' do let(:full_and_flush_job) { create(:harvest_job, parser_id: '12345', version_id: '666', records_count: 1, limit: 0, status: 'running', mode: 'full_and_flush') } @@ -168,4 +192,13 @@ expect(full_and_flush_job_with_no_records.full_and_flush_available?).to eq false end end + + describe '#job_states' do + let(:stateful_job) { create(:harvest_job, :stateful) } + + it 'can have an embedded history of the job state' do + expect(stateful_job.states.count).not_to eq(0) + expect(stateful_job.states.first).to be_a(State) + end + end end diff --git a/spec/models/state_spec.rb b/spec/models/state_spec.rb new file mode 100644 index 00000000..32bf48d8 --- /dev/null +++ b/spec/models/state_spec.rb @@ -0,0 +1,30 @@ +# frozen_string_literal: true + +require 'rails_helper' + +RSpec.describe State do + let(:stateful_job) { create(:harvest_job, :stateful) } + let(:state) { stateful_job.states.first } + + describe '#attributes' do + it 'has the page number' do + expect(state.page).to eq 1 + end + + it 'has the per_page count' do + expect(state.per_page).to eq 10 + end + + it 'has the limit' do + expect(state.limit).to eq 100 + end + + it 'has the counter' do + expect(state.counter).to eq 1 + end + + it 'has the base_urls' do + expect(state.base_urls).to eq [] + end + end +end