.+)__/)[:record_id]
+ end
+
+ # Calls transformation_queued! and then load_queued! on the given harvest
+ # report, in that order.
+ # NOTE(review): presumably these mark the transformation and load stages as
+ # queued again - confirm against the harvest report model.
+ #
+ # @param harvest_report the report object to update; must respond to both
+ #   bang methods
+ def reset_harvest_report(harvest_report)
+ harvest_report.transformation_queued!
+ harvest_report.load_queued!
+ end
+
+ # Creates @tmp_directory unless it is already present on disk.
+ def setup_tmp_directory
+   Dir.mkdir(@tmp_directory) unless Dir.exist?(@tmp_directory)
+ end
+
+ # Relocates every entry of @extraction_folder, except the 'tmp' entry
+ # itself, into @tmp_directory under the same name.
+ def move_extracted_documents_into_tmp_directory
+   Dir.children(@extraction_folder).each do |entry|
+     source = "#{@extraction_folder}/#{entry}"
+     target = "#{@tmp_directory}/#{entry}"
+
+     FileUtils.move(source, target) unless entry == 'tmp'
+   end
+ end
+
+ # Hook that subclasses override to process the files found in
+ # @tmp_directory. This base version is never meant to run.
+ #
+ # @raise [RuntimeError] always, to flag a subclass that did not override it
+ def process_extracted_documents
+ raise 'process_extracted_documents not defined in child class'
+ end
+
+ # Hook that subclasses override to build and save one extracted document.
+ #
+ # Accepts and ignores any arguments: overrides take parameters, so a
+ # subclass that forgets to override this must still hit the explicit error
+ # below rather than an ArgumentError about the number of arguments.
+ #
+ # @raise [RuntimeError] always, to flag a subclass that did not override it
+ def create_document(*_args)
+   raise 'create_document not defined in child class'
+ end
+end
diff --git a/app/sidekiq/split_worker.rb b/app/sidekiq/split_worker.rb
index c6e85440..881b5867 100644
--- a/app/sidekiq/split_worker.rb
+++ b/app/sidekiq/split_worker.rb
@@ -1,63 +1,6 @@
# frozen_string_literal: true
-class SplitWorker
- include Sidekiq::Job
-
- sidekiq_options retry: 0
-
- def perform(extraction_job_id)
- @extraction_job = ExtractionJob.find(extraction_job_id)
- @extraction_definition = @extraction_job.extraction_definition
- @extraction_folder = @extraction_job.extraction_folder
- @tmp_directory = "#{@extraction_folder}/tmp"
- @page = 1
-
- setup_tmp_directory
- move_extracted_documents_into_tmp_directory
- process_extracted_documents
-
- FileUtils.remove_dir(@tmp_directory)
-
- create_transformation_jobs if @extraction_job.harvest_job.present?
- end
-
- private
-
- def create_transformation_jobs
- harvest_report = @extraction_job.harvest_job.harvest_report
- pipeline_job = harvest_report.pipeline_job
-
- reset_harvest_report(harvest_report)
-
- (@extraction_job.extraction_definition.page..@extraction_job.documents.total_pages).each do |page|
- harvest_report.increment_pages_extracted!
- TransformationWorker.perform_async(@extraction_job.harvest_job.id, page)
- harvest_report.increment_transformation_workers_queued!
-
- pipeline_job.reload
- break if pipeline_job.cancelled?
- end
- end
-
- def reset_harvest_report(harvest_report)
- harvest_report.update(pages_extracted: 0)
- harvest_report.extraction_completed!
- end
-
- def setup_tmp_directory
- return if Dir.exist?(@tmp_directory)
-
- Dir.mkdir(@tmp_directory)
- end
-
- def move_extracted_documents_into_tmp_directory
- Dir.children(@extraction_folder).each do |file|
- next if file == 'tmp'
-
- FileUtils.move("#{@extraction_folder}/#{file}", "#{@tmp_directory}/#{file}")
- end
- end
-
+class SplitWorker < FileExtractionWorker
def process_extracted_documents
Dir.children(@tmp_directory).each do |file|
saved_response = JSON.parse(File.read("#{@tmp_directory}/#{file}"))
diff --git a/app/sidekiq/text_extraction_worker.rb b/app/sidekiq/text_extraction_worker.rb
new file mode 100644
index 00000000..a8a55173
--- /dev/null
+++ b/app/sidekiq/text_extraction_worker.rb
@@ -0,0 +1,25 @@
+# frozen_string_literal: true
+
+# Converts each file in the tmp directory into a JSON extraction document
+# whose body carries the text that Yomu extracts from that file.
+class TextExtractionWorker < FileExtractionWorker
+  # Reads every file in @tmp_directory, extracts its text and saves a
+  # document with the same filename into @extraction_folder, incrementing
+  # @page per file. Finally sets the extraction definition's format to 'JSON'.
+  def process_extracted_documents
+    Dir.children(@tmp_directory).each do |file|
+      # The source files may be binary (e.g. PDF or Word documents), so read
+      # the raw bytes rather than going through a text-mode read.
+      saved_file = File.binread("#{@tmp_directory}/#{file}")
+
+      # No stored response metadata is read for these files, so a fixed
+      # placeholder is used ('url' and 'params' are absent and stay nil).
+      saved_response = { 'method' => 'GET', 'status' => 200, 'response_headers' => [], 'request_headers' => [] }
+
+      create_document(Yomu.read(:text, saved_file), saved_response, file)
+      @page += 1
+    end
+
+    @extraction_definition.update(format: 'JSON')
+  end
+
+  # Builds an Extraction::Document from the response metadata, with the
+  # extracted text wrapped as a {"text": ...} JSON body, and saves it to
+  # @extraction_folder under the given filename.
+  #
+  # @param extracted_text the text extracted from the source file
+  # @param saved_response [Hash] string-keyed request/response metadata
+  # @param filename [String] name of the file to write inside @extraction_folder
+  def create_document(extracted_text, saved_response, filename)
+    Extraction::Document.new(
+      url: saved_response['url'], method: saved_response['method'],
+      params: saved_response['params'], request_headers: saved_response['request_headers'],
+      status: saved_response['status'], response_headers: saved_response['response_headers'],
+      body: { text: extracted_text }.to_json
+    ).save("#{@extraction_folder}/#{filename}")
+  end
+end
diff --git a/app/supplejack/extraction/document.rb b/app/supplejack/extraction/document.rb
index 1c284189..7c2a7ee7 100644
--- a/app/supplejack/extraction/document.rb
+++ b/app/supplejack/extraction/document.rb
@@ -23,6 +23,11 @@ def successful?
def save(file_path)
File.write(file_path, to_json)
+ # If the document cannot be converted to JSON, write the original
+ # body to the file path in binary mode instead.
+ # It is probably a PDF or Word document.
+ rescue JSON::GeneratorError
+ File.write(file_path, @body, mode: 'wb')
end
def size_in_bytes
@@ -35,6 +40,8 @@ def self.load_from_file(file_path)
Rails.logger.debug { "Loading document #{file_path}" }
json = JSON.parse(File.read(file_path)).symbolize_keys
Document.new(file_path, **json)
+ rescue JSON::ParserError
+ {}
end
def to_hash
diff --git a/app/supplejack/extraction/enrichment_execution.rb b/app/supplejack/extraction/enrichment_execution.rb
index 7b370370..0bf3a2e9 100644
--- a/app/supplejack/extraction/enrichment_execution.rb
+++ b/app/supplejack/extraction/enrichment_execution.rb
@@ -58,6 +58,7 @@ def new_enrichment_extraction(api_record, page)
def enqueue_record_transformation(api_record, document, page)
return unless @harvest_job.present? && document.successful?
+ return if @extraction_definition.extract_text_from_file?
TransformationWorker.perform_async(@harvest_job.id, page, api_record['id'])
@harvest_report.increment_transformation_workers_queued! if @harvest_report.present?
diff --git a/app/supplejack/extraction/execution.rb b/app/supplejack/extraction/execution.rb
index a5ccf37d..e0231203 100644
--- a/app/supplejack/extraction/execution.rb
+++ b/app/supplejack/extraction/execution.rb
@@ -79,10 +79,15 @@ def extract_and_save_document(request)
end
def enqueue_record_transformation
- return unless @harvest_job.present? && @de.document.successful? && !@extraction_definition.split
+ return unless @harvest_job.present? && @de.document.successful?
+ return if requires_additional_processing?
TransformationWorker.perform_async(@harvest_job.id, @extraction_definition.page)
@harvest_report.increment_transformation_workers_queued!
end
+
+ # Truthy when the extraction definition answers split? or
+ # extract_text_from_file?. enqueue_record_transformation returns early in
+ # that case instead of queueing a TransformationWorker.
+ def requires_additional_processing?
+ @extraction_definition.split? || @extraction_definition.extract_text_from_file?
+ end
end
end
diff --git a/app/supplejack/transformation/transformed_record.rb b/app/supplejack/transformation/transformed_record.rb
index a72fbe12..f5c10d64 100644
--- a/app/supplejack/transformation/transformed_record.rb
+++ b/app/supplejack/transformation/transformed_record.rb
@@ -23,14 +23,8 @@ def errors
end
end
- def rejection_reasons
- @reject_fields.each_with_object([]) do |field, reasons|
- reasons.push(field.name) if field.value == true
- end
- end
-
- def deletion_reasons
- @delete_fields.each_with_object([]) do |field, reasons|
+ def reasons(fields)
+ fields.each_with_object([]) do |field, reasons|
reasons.push(field.name) if field.value == true
end
end
@@ -39,8 +33,8 @@ def to_hash
{
'transformed_record' => transformed_record,
'errors' => errors,
- 'rejection_reasons' => rejection_reasons,
- 'deletion_reasons' => deletion_reasons
+ 'rejection_reasons' => reasons(@reject_fields),
+ 'deletion_reasons' => reasons(@delete_fields)
}
end
end
diff --git a/app/views/extraction_definitions/_create_edit_enrichment_modal.html.erb b/app/views/extraction_definitions/_create_edit_enrichment_modal.html.erb
index bf014d32..d1d8034b 100644
--- a/app/views/extraction_definitions/_create_edit_enrichment_modal.html.erb
+++ b/app/views/extraction_definitions/_create_edit_enrichment_modal.html.erb
@@ -84,8 +84,28 @@
- <%= form.select :destination_id, options_from_collection_for_select(@destinations, 'id', 'name'), {},
- class: 'form-select' %>
+ <%= form.select(:destination_id,
+ options_from_collection_for_select(
+ @destinations, 'id', 'name', @extraction_definition&.destination&.id
+ ),
+ {}, class: 'form-select') %>
+
+
+
+ <%= form.label :extract_text_from_file, class: 'form-label' do %>
+ Extract text from file
+
+
+
+ <% end %>
+
+
+
+ <%= form.select :extract_text_from_file, options_for_select(
+ [%w[No false], %w[Yes true]], model.extract_text_from_file
+ ), {}, class: 'form-select' %>
diff --git a/app/views/extraction_definitions/_create_edit_harvest_modal.html.erb b/app/views/extraction_definitions/_create_edit_harvest_modal.html.erb
index 15d23b7e..392203a6 100644
--- a/app/views/extraction_definitions/_create_edit_harvest_modal.html.erb
+++ b/app/views/extraction_definitions/_create_edit_harvest_modal.html.erb
@@ -98,6 +98,23 @@
), {}, class: 'form-select' %>
+
+ <%= form.label :extract_text_from_file, class: 'form-label' do %>
+ Extract text from file
+
+
+
+ <% end %>
+
+
+
+ <%= form.select :extract_text_from_file, options_for_select(
+ [%w[No false], %w[Yes true]], model.extract_text_from_file
+ ), {}, class: 'form-select' %>
+
+