diff --git a/Gemfile b/Gemfile index 7c9adb75..3f3caeb6 100644 --- a/Gemfile +++ b/Gemfile @@ -42,9 +42,11 @@ gem 'rqrcode' gem 'faraday', '~> 2.9' gem 'faraday-follow_redirects' gem 'jsonpath' +gem 'minitar' gem 'nokogiri' gem 'sidekiq' gem 'yomu' +gem 'zlib' # transformation related gem 'webmock' diff --git a/Gemfile.lock b/Gemfile.lock index 67f3e5cf..77988e4b 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -243,7 +243,8 @@ GEM mime-types-data (~> 3.2015) mime-types-data (3.2024.0507) mini_mime (1.1.5) - minitest (5.22.3) + minitar (0.9) + minitest (5.19.0) multi_json (1.15.0) mutex_m (0.2.0) mysql2 (0.5.6) @@ -465,7 +466,8 @@ GEM nokogiri (~> 1.8) yard (0.9.36) yomu (0.1.5) - zeitwerk (2.6.13) + zeitwerk (2.6.16) + zlib (3.1.1) PLATFORMS aarch64-linux-musl @@ -493,6 +495,7 @@ DEPENDENCIES jsonpath kaminari letter_opener + minitar mysql2 (~> 0.5) nokogiri pry-byebug @@ -517,6 +520,7 @@ DEPENDENCIES webmock yard yomu + zlib RUBY VERSION ruby 3.2.2p53 diff --git a/app/controllers/requests_controller.rb b/app/controllers/requests_controller.rb index 43572b04..63922478 100644 --- a/app/controllers/requests_controller.rb +++ b/app/controllers/requests_controller.rb @@ -36,9 +36,11 @@ def harvest_request @previous_response = Extraction::DocumentExtraction.new(@previous_request).extract end + document = Extraction::DocumentExtraction.new(@request, nil, @previous_response).extract + document.body = "Tar files can't be displayed" if @extraction_definition.format == 'ARCHIVE_JSON' render json: @request.to_h.merge( - preview: Extraction::DocumentExtraction.new(@request, nil, @previous_response).extract + preview: document ) end diff --git a/app/frontend/js/apps/TransformationApp/components/RecordViewer.jsx b/app/frontend/js/apps/TransformationApp/components/RecordViewer.jsx index 9190ac89..b2e395f9 100644 --- a/app/frontend/js/apps/TransformationApp/components/RecordViewer.jsx +++ b/app/frontend/js/apps/TransformationApp/components/RecordViewer.jsx @@ -9,7 +9,7 @@ const RecordViewer = ({ record, format }) => { const editor = useRef(); function doc() { - if (format == "JSON") { + if (format == "JSON" || format == "ARCHIVE_JSON") { return JSON.stringify(record, null, 2); } else if (format == "XML") { return xmlFormat(record, { indentation: " ", lineSeparator: "\n" }); diff --git a/app/frontend/js/components/CodeEditor.jsx b/app/frontend/js/components/CodeEditor.jsx index a9e7aea8..794352a9 100644 --- a/app/frontend/js/components/CodeEditor.jsx +++ b/app/frontend/js/components/CodeEditor.jsx @@ -11,7 +11,7 @@ const CodeEditor = ({ initContent, onChange, format = "ruby", ...props }) => { const editorRef = useRef(); const editorExtensions = () => { - if (format == "JSON") { + if (format == "JSON" || format == "ARCHIVE_JSON") { return [basicSetup, json(), EditorState.readOnly.of(true)]; } else if (format == "XML" || format == "HTML") { return [ @@ -39,7 +39,7 @@ const CodeEditor = ({ initContent, onChange, format = "ruby", ...props }) => { const doc = () => { let content; - if (format == "JSON") { + if (format == "JSON" || format == "ARCHIVE_JSON") { try { content = JSON.stringify(JSON.parse(initContent), null, 2); } catch (err) { diff --git a/app/frontend/js/editor.js b/app/frontend/js/editor.js index dbe25f72..4f427b8c 100644 --- a/app/frontend/js/editor.js +++ b/app/frontend/js/editor.js @@ -8,7 +8,7 @@ import { ruby } from "@codemirror/legacy-modes/mode/ruby"; import { xml } from "@codemirror/legacy-modes/mode/xml"; export function editorExtensions(format, readOnly, formField) { - if (format == "JSON") { + if (format == "JSON" || format == "ARCHIVE_JSON") { return [basicSetup, json(), EditorState.readOnly.of(readOnly)]; } else if (format == "XML" || format == "HTML") { return [ @@ -52,7 +52,7 @@ if (extractionResultViewer) { const format = extractionResultViewer.dataset.format; let results = extractionResultViewer.dataset.results; - if (format == "JSON") { + if (format == "JSON" || format == "ARCHIVE_JSON") { results = JSON.stringify(JSON.parse(results), null, 2); } else if (format == "XML") { results = xmlFormat(results, { indentation: " ", lineSeparator: "\n" }); diff --git a/app/frontend/js/modals/transformationDefinitionSettingsModal.js b/app/frontend/js/modals/transformationDefinitionSettingsModal.js index 09484e61..5d69a258 100644 --- a/app/frontend/js/modals/transformationDefinitionSettingsModal.js +++ b/app/frontend/js/modals/transformationDefinitionSettingsModal.js @@ -75,7 +75,7 @@ function displayPreview(data) { } = data; try { - if (format == "JSON") { + if (format == "JSON" || format == "ARCHIVE_JSON") { result = JSON.stringify(JSON.parse(result), null, 2); } else if (format == "XML") { result = xmlFormat(result, { @@ -145,7 +145,10 @@ function bindRecordSelectorTestEventListeners(id) { (response, _alertClass) => { let results = response.data.result; - if (response.data.format == "JSON") { + if ( + response.data.format == "JSON" || + response.data.format == "ARCHIVE_JSON" + ) { results = JSON.stringify(response.data.result, null, 2); } else if (response.data.format == "XML") { results = xmlFormat(response.data.result, { diff --git a/app/models/extraction_definition.rb b/app/models/extraction_definition.rb index 4b7e53e5..1229e4e7 100644 --- a/app/models/extraction_definition.rb +++ b/app/models/extraction_definition.rb @@ -3,7 +3,7 @@ # Used to store the information for running an extraction # class ExtractionDefinition < ApplicationRecord - FORMATS = %w[JSON XML HTML].freeze + FORMATS = %w[JSON XML HTML ARCHIVE_JSON].freeze # The destination is used for Enrichment Extractions # To know where to pull the records that are to be enriched from @@ -59,6 +59,10 @@ def to_h } end + def json? + format.in? %w[JSON ARCHIVE_JSON] + end + def shared? @shared = harvest_definitions.count > 1 if @shared.nil? @shared diff --git a/app/models/extraction_job.rb b/app/models/extraction_job.rb index a73777ea..e34830c2 100644 --- a/app/models/extraction_job.rb +++ b/app/models/extraction_job.rb @@ -23,6 +23,7 @@ class ExtractionJob < ApplicationRecord end delegate :format, to: :extraction_definition + delegate :json?, to: :extraction_definition # Returns the fullpath to the extraction folder for this job # diff --git a/app/supplejack/extraction/archive.rb b/app/supplejack/extraction/archive.rb new file mode 100644 index 00000000..50d2e87e --- /dev/null +++ b/app/supplejack/extraction/archive.rb @@ -0,0 +1,47 @@ +# frozen_string_literal: true + +require 'archive/tar/minitar' + +module Extraction + class Archive + def self.gzipped?(gz_file_path) + File.open(gz_file_path, 'rb') do |file| + magic_number = file.read(2).unpack('C2') + return magic_number == [0x1F, 0x8B] + end + end + + def self.extract_from_gz(gz_path) + Zlib::GzipReader.open(gz_path, &:read) + end + + def self.top_level_entry?(full_name) + count = full_name.count('/') + return true if count.zero? + return true if count == 1 && full_name[-1] == '/' + + false + end + + def self.list_top_level_entries(input) + top_level_entries = [] + input.each do |entry| + full_name = entry.full_name + top_level_entries << full_name if Archive.top_level_entry?(full_name) + end + + top_level_entries + end + + def self.body(extracted_file_path) + body = if Archive.gzipped?(extracted_file_path) + Archive.extract_from_gz(extracted_file_path) + else + File.read(extracted_file_path) + end + + File.delete(extracted_file_path) + body + end + end +end diff --git a/app/supplejack/extraction/archive_extraction.rb b/app/supplejack/extraction/archive_extraction.rb new file mode 100644 index 00000000..18cabb7f --- /dev/null +++ b/app/supplejack/extraction/archive_extraction.rb @@ -0,0 +1,96 @@ +# frozen_string_literal: true + +module Extraction + class ArchiveExtraction < AbstractExtraction + def initialize(request, extraction_folder = nil, response = nil) + super() + @request = request + @extraction_folder = extraction_folder + @response = response + @document = nil + end + + def download_archive + extract + end + + def save_entries(extraction_folder) + each_entry(extraction_folder) do |_, name| + current_document = build_doc("#{extraction_folder}/#{name}") + save(current_document) + next_page + end + ensure + clear_extracted_archive(extraction_folder) + end + + private + + def each_entry(extraction_folder) + Minitar::Input.open(StringIO.new(@document.body)) do |input| + input.each do |entry| + input.extract_entry(extraction_folder, entry) do |action, name, stats| + next if action != :file_done + + yield(action, name, stats) + end + end + end + end + + def clear_extracted_archive(extraction_folder) + top_level_entries = [] + Minitar::Input.open(StringIO.new(@document.body)) do |input| + top_level_entries = Archive.list_top_level_entries(input) + end + FileUtils.rm_rf(top_level_entries.compact_blank.map { |file| "#{extraction_folder}/#{file}" }) + end + + def build_doc(entry_path) + Document.new( + url: @document.url, + method: @document.method, + params: @document.params, + request_headers: @document.request_headers, + status: @document.status, + response_headers: @document.response_headers, + body: Archive.body(entry_path) + ) + end + + def extraction_definition + @request.extraction_definition + end + + def next_page + extraction_definition.page += 1 + end + + def save(doc) + raise ArgumentError, 'extraction_folder was not provided in #new' if @extraction_folder.blank? + raise '#extract must be called before #save AbstractExtraction' if @document.blank? + + doc.save(file_path) + end + + def file_path + page_str = format('%09d', extraction_definition.page)[-9..] + name_str = extraction_definition.name.parameterize(separator: '_') + "#{@extraction_folder}/#{name_str}__-__#{page_str}.json" + end + + def url + @request.url(@response) + end + + def params + @request.query_parameters(@response) + end + + def headers + return super if @request.headers.blank? + + super.merge(@request.headers(@response)) + end + end +end diff --git a/app/supplejack/extraction/document.rb b/app/supplejack/extraction/document.rb index 50f16b29..0a48096b 100644 --- a/app/supplejack/extraction/document.rb +++ b/app/supplejack/extraction/document.rb @@ -4,7 +4,8 @@ module Extraction # Manages the filesystem part of the request object # Saves it to filesystem and loads it in memory class Document - attr_reader :status, :request_headers, :response_headers, :body, :url, :method, :params, :file_path + attr_reader :status, :request_headers, :response_headers, :url, :method, :params, :file_path + attr_accessor :body def initialize(file_path = nil, **kwargs) @file_path = file_path diff --git a/app/supplejack/extraction/execution.rb b/app/supplejack/extraction/execution.rb index e0231203..9debcc0d 100644 --- a/app/supplejack/extraction/execution.rb +++ b/app/supplejack/extraction/execution.rb @@ -1,5 +1,8 @@ # frozen_string_literal: true +require 'zlib' +require 'archive/tar/minitar' + module Extraction # Performs the work as defined in the document extraction class Execution @@ -11,24 +14,38 @@ def initialize(job, extraction_definition) end def call - extract_and_save_document(@extraction_definition.requests.first) + extract(@extraction_definition.requests.first) return if @extraction_job.is_sample? || set_number_reached? return unless @extraction_definition.paginated? loop do - @extraction_definition.page += 1 - - extract_and_save_document(@extraction_definition.requests.last) - + next_page + extract(@extraction_definition.requests.last) throttle - break if @extraction_job.reload.cancelled? + break if execution_cancelled? break if stop_condition_met? end end private + def extract(request) + if @extraction_definition.format == 'ARCHIVE_JSON' + extract_archive_and_save(request) + else + extract_and_save_document(request) + end + end + + def next_page + @extraction_definition.page += 1 + end + + def execution_cancelled? + @extraction_job.reload.cancelled? + end + def stop_condition_met? [set_number_reached?, extraction_failed?, duplicate_document_extracted?, custom_stop_conditions_met?].any?(true) end @@ -53,15 +70,23 @@ def duplicate_document_extracted? end def custom_stop_conditions_met? - return false if @extraction_definition.stop_conditions.empty? + stop_conditions = @extraction_definition.stop_conditions + return false if stop_conditions.empty? - @extraction_definition.stop_conditions.map { |condition| condition.evaluate(@de.document.body) }.any?(true) + stop_conditions.map { |condition| condition.evaluate(@de.document.body) }.any?(true) end def throttle sleep @extraction_definition.throttle / 1000.0 end + def extract_archive_and_save(request) + extraction_folder = @extraction_job.extraction_folder + @de = ArchiveExtraction.new(request, extraction_folder, @previous_request) + @de.download_archive + @de.save_entries(extraction_folder) + end + def extract_and_save_document(request) @de = DocumentExtraction.new(request, @extraction_job.extraction_folder, @previous_request) @previous_request = @de.extract diff --git a/app/supplejack/transformation/raw_records_extractor.rb b/app/supplejack/transformation/raw_records_extractor.rb index d3b2c8e0..5bf7d89c 100644 --- a/app/supplejack/transformation/raw_records_extractor.rb +++ b/app/supplejack/transformation/raw_records_extractor.rb @@ -39,12 +39,14 @@ def json_extract(page) end def format + return 'JSON' if @extraction_job.extraction_definition.format == 'ARCHIVE_JSON' + @extraction_job.extraction_definition.format end def record_selector return @transformation_definition.record_selector if @transformation_definition.record_selector.present? - return '*' if format == 'JSON' + return '*' if format.in?(%w[JSON ARCHIVE_JSON]) '/' end diff --git a/app/views/transformation_definitions/_create_edit_modal.html.erb b/app/views/transformation_definitions/_create_edit_modal.html.erb index 4038977c..1367dc64 100644 --- a/app/views/transformation_definitions/_create_edit_modal.html.erb +++ b/app/views/transformation_definitions/_create_edit_modal.html.erb @@ -116,7 +116,7 @@ + <% if model.extraction_job.json? %> data-result="<%= model.records.first.to_json %>" <% else %> data-result="<%= model.records.first %>" @@ -129,7 +129,7 @@ + <% if extraction_job.json? %> <%# TODO: load this with Ajax? %> data-result="<%= TransformationDefinition.new(extraction_job:).records.first.to_json %>" <% else %>