Skip to content

Commit

Permalink
Merge branch 'main' into rm/activity
Browse files Browse the repository at this point in the history
  • Loading branch information
richardmatthewsdev committed Aug 30, 2024
2 parents f8b2b6c + 29c70aa commit e19312c
Show file tree
Hide file tree
Showing 167 changed files with 1,930 additions and 139 deletions.
4 changes: 2 additions & 2 deletions app/models/extraction_job.rb
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ def create_folder
# Removes the extraction folder and everything inside it.
# rm_rf (not plain rm) is required because extracted documents are now
# stored in numbered subfolders, not as a flat list of files.
def delete_folder
  return unless Dir.exist?(extraction_folder)

  FileUtils.rm_rf Dir.glob("#{extraction_folder}/*")
  Dir.rmdir(extraction_folder)
end

Expand All @@ -63,6 +63,6 @@ def documents
#
# @return Integer
# Total size in bytes of every file in the extraction folder, recursing
# into the numbered subfolders.
#
# NOTE(review): the "**/*.*" pattern only matches names containing a dot,
# so extensionless files are excluded from the total — confirm every
# extracted document is saved with a file extension.
#
# @return Integer
def extraction_folder_size_in_bytes
  Dir.glob("#{extraction_folder}/**/*.*").sum { |f| File.size(f) }
end
end
10 changes: 6 additions & 4 deletions app/sidekiq/file_extraction_worker.rb
Original file line number Diff line number Diff line change
Expand Up @@ -71,10 +71,8 @@ def setup_tmp_directory
end

# Moves every numbered extraction subfolder into the tmp working
# directory, skipping the tmp directory itself so it is never moved
# into its own subtree.
def move_extracted_documents_into_tmp_directory
  # end_with? is the plain-Ruby form of ActiveSupport's ends_with? alias.
  Dir.children(@extraction_folder).reject { |f| f.end_with?('tmp') }.each do |folder|
    FileUtils.move("#{@extraction_folder}/#{folder}", @tmp_directory)
  end
end

Expand All @@ -85,4 +83,8 @@ def process_extracted_documents
# Abstract hook: every concrete file-extraction worker must provide its
# own create_document implementation. Calling the base version is a
# programming error, surfaced immediately via RuntimeError.
def create_document
  message = 'create_document not defined in child class'
  raise message
end

# Maps a 1-based page number to the numbered folder it is stored in:
# pages are bucketed DOCUMENTS_PER_FOLDER at a time
# (pages 1..100 -> folder 1, 101..200 -> folder 2, ...).
def folder_number(page = 1)
  per_folder = Extraction::Documents::DOCUMENTS_PER_FOLDER.to_f
  (page / per_folder).ceil
end
end
14 changes: 8 additions & 6 deletions app/sidekiq/split_worker.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,14 @@

class SplitWorker < FileExtractionWorker
# Walks each numbered folder inside the tmp working directory, parses
# every saved response file, and splits its XML body into new documents
# of at most 100 records each.
def process_extracted_documents
  Dir.children(@tmp_directory).each do |folder|
    Dir.children("#{@tmp_directory}/#{folder}").each do |file|
      saved_response = JSON.parse(File.read("#{@tmp_directory}/#{folder}/#{file}"))

      # Split on the user-configured selector; @page numbers the
      # documents generated by create_document.
      Nokogiri::XML(saved_response['body']).xpath(@extraction_definition.split_selector).each_slice(100) do |records|
        create_document(records, saved_response)
        @page += 1
      end
    end
  end
end
Expand All @@ -21,6 +23,6 @@ def create_document(records, saved_response)
params: saved_response['params'], request_headers: saved_response['request_headers'],
status: saved_response['status'], response_headers: saved_response['response_headers'],
body: "<?xml version=\"1.0\"?><root><records>#{records.map(&:to_xml).join}</records></root>"
).save("#{@extraction_folder}/#{name_str}__-__#{page_str}.json")
).save("#{@extraction_folder}/#{folder_number(@page)}/#{name_str}__-__#{page_str}.json")
end
end
33 changes: 18 additions & 15 deletions app/sidekiq/text_extraction_worker.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2,52 +2,55 @@

class TextExtractionWorker < FileExtractionWorker
# Reads every file from each numbered folder in the tmp directory,
# extracts its text, and writes the result as a JSON document back into
# the matching numbered folder of the extraction folder.
def process_extracted_documents
  Dir.children(@tmp_directory).each do |folder|
    Dir.children("#{@tmp_directory}/#{folder}").each do |file|
      loaded_file = File.read("#{@tmp_directory}/#{folder}/#{file}")

      extracted_text = extract_text(loaded_file, file, folder)
      filepath = "#{@extraction_folder}/#{folder}/#{file}"
      create_document(extracted_text[:text], filepath, extracted_text[:process])
      @page += 1
    end
  end

  # The extraction now holds JSON documents regardless of the source format.
  @extraction_definition.update(format: 'JSON')
end

# Persists the extracted text (plus the process used to obtain it) as a
# JSON document at filepath.
#
# NOTE(review): saved_response is not defined in this method and no
# reader for it is visible in this class — it looks like a leftover from
# the pre-refactor code path and would raise NameError at runtime;
# confirm where it is meant to come from.
def create_document(extracted_text, filepath, process)
  Extraction::Document.new(
    url: saved_response['url'], method: saved_response['method'],
    params: saved_response['params'], request_headers: saved_response['request_headers'],
    status: saved_response['status'], response_headers: saved_response['response_headers'],
    body: { text: extracted_text, process: }.to_json
  ).save(filepath)
end

private

# Extracts plain text from a loaded file, primarily via Yomu (Tika).
# For PDFs that yield no text (image-only scans), falls back to OCR.
#
# Returns a hash with :text and a human-readable :process description.
def extract_text(loaded_file, file, folder)
  mimetype = Marcel::MimeType.for(loaded_file)

  text = Yomu.read(:text, loaded_file)
  process = "Extracted from #{mimetype} using Yomu"

  # Yomu returns only whitespace for image-only PDFs; OCR those instead.
  if text.squish.empty? && mimetype == 'application/pdf'
    text = ocr_pdf(file, folder)
    process = 'Extracted from PDF using OCRmyPDF'
  end

  { text:, process: }
end

def ocr_pdf(file)
def ocr_pdf(file, folder)
base_file_name = File.basename(file, File.extname(file))

`ocrmypdf \
"#{@tmp_directory.shellescape}/#{file.shellescape}" \
--sidecar "#{@tmp_directory.shellescape}/#{base_file_name.shellescape}.txt" - \
"#{@tmp_directory.shellescape}/#{folder.shellescape}/#{file.shellescape}" \
--sidecar "#{@tmp_directory.shellescape}/#{folder.shellescape}/#{base_file_name.shellescape}.txt" - \
--redo-ocr --output-type=none -q`

if File.exist?("#{@tmp_directory}/#{base_file_name}.txt")
File.read("#{@tmp_directory}/#{base_file_name}.txt")
if File.exist?("#{@tmp_directory}/#{folder}/#{base_file_name}.txt")
File.read("#{@tmp_directory}/#{folder}/#{base_file_name}.txt")
else
'OCR failed'
end
Expand Down
4 changes: 4 additions & 0 deletions app/supplejack/extraction/abstract_extraction.rb
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,10 @@ def headers
}
end

# Returns the 1-based folder bucket a given page belongs to, with
# DOCUMENTS_PER_FOLDER pages stored per folder.
def folder_number(page = 1)
  page.fdiv(Extraction::Documents::DOCUMENTS_PER_FOLDER).ceil
end

def http_method
return 'get' if @request.nil?

Expand Down
4 changes: 2 additions & 2 deletions app/supplejack/extraction/archive_extraction.rb
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ def download_archive

def save_entries(extraction_folder)
each_entry(extraction_folder) do |_, name|
current_document = build_doc("#{extraction_folder}/#{name}")
current_document = build_doc("#{extraction_folder}/#{folder_number(extraction_definition.page)}/#{name}")
save(current_document)
next_page
end
Expand Down Expand Up @@ -76,7 +76,7 @@ def save(doc)
# Destination path for the current archive entry: a numbered subfolder
# (via folder_number) plus a name__-__<zero-padded page>.json filename.
def file_path
  page_str = format('%09d', extraction_definition.page)[-9..]
  name_str = extraction_definition.name.parameterize(separator: '_')
  "#{@extraction_folder}/#{folder_number(extraction_definition.page)}/#{name_str}__-__#{page_str}.json"
end

def url
Expand Down
3 changes: 3 additions & 0 deletions app/supplejack/extraction/document.rb
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,9 @@ def successful?
end

def save(file_path)
# Create the directory if it doesn't exist
FileUtils.mkdir_p(File.dirname(file_path))

File.write(file_path, to_json)
# If the file fails to be converted to a JSON document
# write the original file to the filepath as a binary
Expand Down
2 changes: 1 addition & 1 deletion app/supplejack/extraction/document_extraction.rb
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ def extract
# Destination path for the current document: a numbered subfolder
# (via folder_number) plus a name__-__<zero-padded page>.json filename.
def file_path
  page_str = format('%09d', @extraction_definition.page)[-9..]
  name_str = @extraction_definition.name.parameterize(separator: '_')
  "#{@extraction_folder}/#{folder_number(@extraction_definition.page)}/#{name_str}__-__#{page_str}.json"
end

def url
Expand Down
28 changes: 16 additions & 12 deletions app/supplejack/extraction/documents.rb
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
module Extraction
# Currently used for the view to generate kaminari paginations
class Documents
DOCUMENTS_PER_FOLDER = 100
attr_reader :current_page, :per_page, :limit_value

def initialize(folder)
Expand All @@ -17,28 +18,31 @@ def initialize(folder)

# Returns the Document for the given page key (1-based; nil key means
# page 1), or nil when no file exists for that page.
def [](key)
  @current_page = key&.to_i || 1

  # Resolve the path once — documents_filepath runs a Dir.glob each call.
  path = documents_filepath
  return nil if path.blank?

  @documents[@current_page] = Document.load_from_file(path)
end

# Total number of document pages: every folder except the last holds
# exactly DOCUMENTS_PER_FOLDER files, so only the last needs counting.
def total_pages
  folders = total_folders
  return 0 if folders.zero?

  ((folders - 1) * DOCUMENTS_PER_FOLDER) + Dir.glob("#{@folder}/#{folders}/*").count
end

# Number of numbered document folders, excluding the tmp working folder.
# Uses stdlib end_with? rather than the Rails-only ends_with? alias.
def total_folders
  Dir.children(@folder).count { |f| !f.end_with?('tmp') }
end

private

# Finds the single file for @current_page: looks inside that page's
# numbered folder for a name ending in the zero-padded page number.
# Returns nil when the page has no file.
def documents_filepath
  # Local renamed from folder_number to avoid shadowing the method.
  folder = folder_number(@current_page)
  page_number = format('%09d', @current_page)[-9..]
  @documents_filepath = Dir.glob("#{@folder}/#{folder}/*__#{page_number}.json").first
end

# Maps a 1-based page number to the folder it lives in, with
# DOCUMENTS_PER_FOLDER pages stored per folder.
def folder_number(page = 1)
  (page / DOCUMENTS_PER_FOLDER.to_f).ceil
end
end
end
2 changes: 1 addition & 1 deletion app/supplejack/extraction/enrichment_extraction.rb
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def headers
# Destination path for the current enrichment fragment: a numbered
# subfolder (via folder_number) plus name__<record id>__<page>.json.
def file_path
  name_str = @extraction_definition.name.parameterize(separator: '_')
  page_str = format('%09d', @page)[-9..]
  "#{@extraction_folder}/#{folder_number(@page)}/#{name_str}__#{@record['id']}__#{page_str}.json"
end

def fragment_url
Expand Down
2 changes: 1 addition & 1 deletion lib/tasks/fixtures.rake
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ namespace :fixtures do
desc 'Clears the extraction folder, loads the fixtures and launches the job for testing'
task load: :environment do
puts 'Clearing the extraction folder...'
FileUtils.rm Dir.glob("#{ExtractionJob::EXTRACTIONS_FOLDER}/*/*")
FileUtils.rm_rf Dir.glob("#{ExtractionJob::EXTRACTIONS_FOLDER}/*/*")
FileUtils.rmdir Dir.glob("#{ExtractionJob::EXTRACTIONS_FOLDER}/*")

puts 'Resetting the DB...'
Expand Down
58 changes: 31 additions & 27 deletions spec/sidekiq/split_worker_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -11,73 +11,77 @@
describe '#perform' do
context 'when the request is not paginated' do
before do
  # Fixtures now live inside numbered subfolders of the extraction folder.
  FileUtils.mkdir_p("#{extraction_job.extraction_folder}/1")
  FileUtils.cp("#{Rails.root}/spec/support/split_example.json", "#{extraction_job.extraction_folder}/1/split_example.json")
end

it 'splits a large file into chunks of 100' do
  # Recursive glob because documents are stored in numbered subfolders.
  extracted_files = Dir.glob("#{extraction_job.extraction_folder}/**/*").select { |e| File.file? e }

  expect(extracted_files.count).to eq 1

  SplitWorker.new.perform(extraction_job.id)

  extracted_files = Dir.glob("#{extraction_job.extraction_folder}/**/*").select { |e| File.file? e }

  expect(extracted_files.count).to eq 2
end

it 'cleans up the tmp folder it creates' do
  expect(Dir.exist?("#{extraction_job.extraction_folder}/tmp")).to eq false

  SplitWorker.new.perform(extraction_job.id)

  expect(Dir.exist?("#{extraction_job.extraction_folder}/tmp")).to eq false
end

it 'names the new files following the appropriate naming convention' do
  SplitWorker.new.perform(extraction_job.id)

  # NOTE(review): the original expectations had no matcher attached, so
  # they asserted nothing; `.to eq true` makes them actually verify.
  expect(File.exist?("#{extraction_job.extraction_folder}/1/national-library-of-new-zealand_harvest-extraction-210__-__000000001.json")).to eq true
  expect(File.exist?("#{extraction_job.extraction_folder}/1/national-library-of-new-zealand_harvest-extraction-210__-__000000002.json")).to eq true
  expect(File.exist?("#{extraction_job.extraction_folder}/1/national-library-of-new-zealand_harvest-extraction-210__-__000000003.json")).to eq true
  expect(File.exist?("#{extraction_job.extraction_folder}/1/national-library-of-new-zealand_harvest-extraction-210__-__000000004.json")).to eq true
end
end

context 'when the request is paginated' do
before do
  # Two source files in folder 1 simulate a paginated extraction.
  FileUtils.mkdir_p("#{extraction_job.extraction_folder}/1")
  FileUtils.cp("#{Rails.root}/spec/support/split_example.json", "#{extraction_job.extraction_folder}/1/split_example_01.json")
  FileUtils.cp("#{Rails.root}/spec/support/split_example.json", "#{extraction_job.extraction_folder}/1/split_example_02.json")
end

it 'splits both files into many small ones that are unique from each other' do
  extracted_files = Dir.glob("#{extraction_job.extraction_folder}/**/*").select { |e| File.file? e }

  expect(extracted_files.count).to eq 2

  SplitWorker.new.perform(extraction_job.id)

  extracted_files = Dir.glob("#{extraction_job.extraction_folder}/**/*").select { |e| File.file? e }

  expect(extracted_files.count).to eq 4
end
end

context 'when the split is not part of a harvest' do
before do
  FileUtils.mkdir_p("#{extraction_job.extraction_folder}/1")
  FileUtils.cp("#{Rails.root}/spec/support/split_example.json", "#{extraction_job.extraction_folder}/1/split_example.json")
end

it 'does not enqueue Transformation Workers' do
  expect(TransformationWorker).not_to receive(:perform_async)

  SplitWorker.new.perform(extraction_job.id)
end
end

context 'when the split is part of a harvest' do
before do
  FileUtils.mkdir_p("#{extraction_job.extraction_folder}/1")
  FileUtils.cp("#{Rails.root}/spec/support/split_example.json", "#{extraction_job.extraction_folder}/1/split_example.json")
end

let!(:harvest_report) { create(:harvest_report, pipeline_job:, harvest_job:) }
Expand All @@ -90,7 +94,7 @@
it 'enqueues Transformation Workers to process the split files' do
  expect(TransformationWorker).to receive(:perform_async).exactly(2).times.and_call_original

  SplitWorker.new.perform(extraction_job.id)
end

it 'stops enqueuing TransformationWorkers if the pipeline has been cancelled' do
Expand All @@ -101,7 +105,7 @@
end

it 'updates the Harvest Report appropriately' do
SplitWorker.new.perform(extraction_job.id)
SplitWorker.new.perform(extraction_job.id)

harvest_report.reload

Expand Down
Loading

0 comments on commit e19312c

Please sign in to comment.