DigitalNZ · paul-mesnilgrente · Jul 15, 2024 · Jun 19, 2024 · Jun 26, 2024 · Jul 9, 2024
diff --git a/Gemfile b/Gemfile
@@ -42,9 +42,11 @@ gem 'rqrcode'
 gem 'faraday', '~> 2.9'
 gem 'faraday-follow_redirects'
 gem 'jsonpath'
+gem 'minitar'
 gem 'nokogiri'
 gem 'sidekiq'
 gem 'yomu'
+gem 'zlib'
 
 # transformation related
 gem 'webmock'

diff --git a/Gemfile.lock b/Gemfile.lock
@@ -243,7 +243,8 @@ GEM
       mime-types-data (~> 3.2015)
     mime-types-data (3.2024.0507)
     mini_mime (1.1.5)
-    minitest (5.22.3)
+    minitar (0.9)
+    minitest (5.19.0)
     multi_json (1.15.0)
     mutex_m (0.2.0)
     mysql2 (0.5.6)
@@ -465,7 +466,8 @@ GEM
       nokogiri (~> 1.8)
     yard (0.9.36)
     yomu (0.1.5)
-    zeitwerk (2.6.13)
+    zeitwerk (2.6.16)
+    zlib (3.1.1)
 
 PLATFORMS
   aarch64-linux-musl
@@ -493,6 +495,7 @@ DEPENDENCIES
   jsonpath
   kaminari
   letter_opener
+  minitar
   mysql2 (~> 0.5)
   nokogiri
   pry-byebug
@@ -517,6 +520,7 @@ DEPENDENCIES
   webmock
   yard
   yomu
+  zlib
 
 RUBY VERSION
    ruby 3.2.2p53

diff --git a/app/controllers/requests_controller.rb b/app/controllers/requests_controller.rb
@@ -36,9 +36,11 @@ def harvest_request
 
       @previous_response = Extraction::DocumentExtraction.new(@previous_request).extract
     end
+    document = Extraction::DocumentExtraction.new(@request, nil, @previous_response).extract
+    document.body = "Tar files can't be displayed" if @extraction_definition.format == 'ARCHIVE_JSON'
 
     render json: @request.to_h.merge(
-      preview: Extraction::DocumentExtraction.new(@request, nil, @previous_response).extract
+      preview: document
     )
   end
 

diff --git a/app/frontend/js/apps/TransformationApp/components/RecordViewer.jsx b/app/frontend/js/apps/TransformationApp/components/RecordViewer.jsx
@@ -9,7 +9,7 @@ const RecordViewer = ({ record, format }) => {
   const editor = useRef();
 
   function doc() {
-    if (format == "JSON") {
+    if (format == "JSON" || format == "ARCHIVE_JSON") {
       return JSON.stringify(record, null, 2);
     } else if (format == "XML") {
       return xmlFormat(record, { indentation: "  ", lineSeparator: "\n" });

diff --git a/app/frontend/js/components/CodeEditor.jsx b/app/frontend/js/components/CodeEditor.jsx
@@ -11,7 +11,7 @@ const CodeEditor = ({ initContent, onChange, format = "ruby", ...props }) => {
   const editorRef = useRef();
 
   const editorExtensions = () => {
-    if (format == "JSON") {
+    if (format == "JSON" || format == "ARCHIVE_JSON") {
       return [basicSetup, json(), EditorState.readOnly.of(true)];
     } else if (format == "XML" || format == "HTML") {
       return [
@@ -39,7 +39,7 @@ const CodeEditor = ({ initContent, onChange, format = "ruby", ...props }) => {
 
   const doc = () => {
     let content;
-    if (format == "JSON") {
+    if (format == "JSON" || format == "ARCHIVE_JSON") {
       try {
         content = JSON.stringify(JSON.parse(initContent), null, 2);
       } catch (err) {

diff --git a/app/frontend/js/editor.js b/app/frontend/js/editor.js
@@ -8,7 +8,7 @@ import { ruby } from "@codemirror/legacy-modes/mode/ruby";
 import { xml } from "@codemirror/legacy-modes/mode/xml";
 
 export function editorExtensions(format, readOnly, formField) {
-  if (format == "JSON") {
+  if (format == "JSON" || format == "ARCHIVE_JSON") {
     return [basicSetup, json(), EditorState.readOnly.of(readOnly)];
   } else if (format == "XML" || format == "HTML") {
     return [
@@ -52,7 +52,7 @@ if (extractionResultViewer) {
   const format = extractionResultViewer.dataset.format;
   let results = extractionResultViewer.dataset.results;
 
-  if (format == "JSON") {
+  if (format == "JSON" || format == "ARCHIVE_JSON") {
     results = JSON.stringify(JSON.parse(results), null, 2);
   } else if (format == "XML") {
     results = xmlFormat(results, { indentation: "  ", lineSeparator: "\n" });

diff --git a/app/frontend/js/modals/transformationDefinitionSettingsModal.js b/app/frontend/js/modals/transformationDefinitionSettingsModal.js
@@ -75,7 +75,7 @@ function displayPreview(data) {
   } = data;
 
   try {
-    if (format == "JSON") {
+    if (format == "JSON" || format == "ARCHIVE_JSON") {
       result = JSON.stringify(JSON.parse(result), null, 2);
     } else if (format == "XML") {
       result = xmlFormat(result, {
@@ -145,7 +145,10 @@ function bindRecordSelectorTestEventListeners(id) {
     (response, _alertClass) => {
       let results = response.data.result;
 
-      if (response.data.format == "JSON") {
+      if (
+        response.data.format == "JSON" ||
+        response.data.format == "ARCHIVE_JSON"
+      ) {
         results = JSON.stringify(response.data.result, null, 2);
       } else if (response.data.format == "XML") {
         results = xmlFormat(response.data.result, {

diff --git a/app/models/extraction_definition.rb b/app/models/extraction_definition.rb
@@ -3,7 +3,7 @@
 # Used to store the information for running an extraction
 #
 class ExtractionDefinition < ApplicationRecord
-  FORMATS = %w[JSON XML HTML].freeze
+  FORMATS = %w[JSON XML HTML ARCHIVE_JSON].freeze
 
   # The destination is used for Enrichment Extractions
   # To know where to pull the records that are to be enriched from
@@ -59,6 +59,10 @@ def to_h
     }
   end
 
+  def json?
+    format.in? %w[JSON ARCHIVE_JSON]
+  end
+
   def shared?
     @shared = harvest_definitions.count > 1 if @shared.nil?
     @shared

diff --git a/app/models/extraction_job.rb b/app/models/extraction_job.rb
@@ -23,6 +23,7 @@ class ExtractionJob < ApplicationRecord
   end
 
   delegate :format, to: :extraction_definition
+  delegate :json?, to: :extraction_definition
 
   # Returns the fullpath to the extraction folder for this job
   #

diff --git a/app/supplejack/extraction/archive.rb b/app/supplejack/extraction/archive.rb
@@ -0,0 +1,47 @@
+# frozen_string_literal: true
+
+require 'archive/tar/minitar'
+
+module Extraction
+  class Archive
+    def self.gzipped?(gz_file_path)
+      File.open(gz_file_path, 'rb') do |file|
+        magic_number = file.read(2).unpack('C2')
+        return magic_number == [0x1F, 0x8B]
+      end
+    end
+
+    def self.extract_from_gz(gz_path)
+      Zlib::GzipReader.open(gz_path, &:read)
+    end
+
+    def self.top_level_entry?(full_name)
+      count = full_name.count('/')
+      return true if count.zero?
+      return true if count == 1 && full_name[-1] == '/'
+
+      false
+    end
+
+    def self.list_top_level_entries(input)
+      top_level_entries = []
+      input.each do |entry|
+        full_name = entry.full_name
+        top_level_entries << full_name if Archive.top_level_entry?(full_name)
+      end
+
+      top_level_entries
+    end
+
+    def self.body(extracted_file_path)
+      body = if Archive.gzipped?(extracted_file_path)
+               Archive.extract_from_gz(extracted_file_path)
+             else
+               File.read(extracted_file_path)
+             end
+
+      File.delete(extracted_file_path)
+      body
+    end
+  end
+end
diff --git a/app/supplejack/extraction/archive_extraction.rb b/app/supplejack/extraction/archive_extraction.rb
@@ -0,0 +1,96 @@
+# frozen_string_literal: true
+
+module Extraction
+  class ArchiveExtraction < AbstractExtraction
+    def initialize(request, extraction_folder = nil, response = nil)
+      super()
+      @request = request
+      @extraction_folder = extraction_folder
+      @response = response
+      @document = nil
+    end
+
+    def download_archive
+      extract
+    end
+
+    def save_entries(extraction_folder)
+      each_entry(extraction_folder) do |_, name|
+        current_document = build_doc("#{extraction_folder}/#{name}")
+        save(current_document)
+        next_page
+      end
+    ensure
+      clear_extracted_archive(extraction_folder)
+    end
+
+    private
+
+    def each_entry(extraction_folder)
+      Minitar::Input.open(StringIO.new(@document.body)) do |input|
+        input.each do |entry|
+          input.extract_entry(extraction_folder, entry) do |action, name, stats|
+            next if action != :file_done
+
+            yield(action, name, stats)
+          end
+        end
+      end
+    end
+
+    def clear_extracted_archive(extraction_folder)
+      top_level_entries = []
+      Minitar::Input.open(StringIO.new(@document.body)) do |input|
+        top_level_entries = Archive.list_top_level_entries(input)
+      end
+      FileUtils.rm_rf(top_level_entries.compact_blank.map { |file| "#{extraction_folder}/#{file}" })
+    end
+
+    def build_doc(entry_path)
+      Document.new(
+        url: @document.url,
+        method: @document.method,
+        params: @document.params,
+        request_headers: @document.request_headers,
+        status: @document.status,
+        response_headers: @document.response_headers,
+        body: Archive.body(entry_path)
+      )
+    end
+
+    def extraction_definition
+      @request.extraction_definition
+    end
+
+    def next_page
+      extraction_definition.page += 1
+    end
+
+    def save(doc)
+      raise ArgumentError, 'extraction_folder was not provided in #new' if @extraction_folder.blank?
+      raise '#extract must be called before #save AbstractExtraction' if @document.blank?
+
+      doc.save(file_path)
+    end
+
+    def file_path
+      page_str = format('%09d', extraction_definition.page)[-9..]
+      name_str = extraction_definition.name.parameterize(separator: '_')
+      "#{@extraction_folder}/#{name_str}__-__#{page_str}.json"
+    end
+
+    def url
+      @request.url(@response)
+    end
+
+    def params
+      @request.query_parameters(@response)
+    end
+
+    def headers
+      return super if @request.headers.blank?
+
+      super.merge(@request.headers(@response))
+    end
+  end
+end
diff --git a/app/supplejack/extraction/document.rb b/app/supplejack/extraction/document.rb
@@ -4,7 +4,8 @@ module Extraction
   # Manages the filesystem part of the request object
   # Saves it to filesystem and loads it in memory
   class Document
-    attr_reader :status, :request_headers, :response_headers, :body, :url, :method, :params, :file_path
+    attr_reader :status, :request_headers, :response_headers, :url, :method, :params, :file_path
+    attr_accessor :body
 
     def initialize(file_path = nil, **kwargs)
       @file_path = file_path

diff --git a/app/supplejack/extraction/execution.rb b/app/supplejack/extraction/execution.rb
@@ -1,5 +1,8 @@
 # frozen_string_literal: true
 
+require 'zlib'
+require 'archive/tar/minitar'
+
 module Extraction
   # Performs the work as defined in the document extraction
   class Execution
@@ -11,24 +14,38 @@ def initialize(job, extraction_definition)
     end
 
     def call
-      extract_and_save_document(@extraction_definition.requests.first)
+      extract(@extraction_definition.requests.first)
       return if @extraction_job.is_sample? || set_number_reached?
       return unless @extraction_definition.paginated?
 
       loop do
-        @extraction_definition.page += 1
-
-        extract_and_save_document(@extraction_definition.requests.last)
-
+        next_page
+        extract(@extraction_definition.requests.last)
         throttle
 
-        break if @extraction_job.reload.cancelled?
+        break if execution_cancelled?
         break if stop_condition_met?
       end
     end
 
     private
 
+    def extract(request)
+      if @extraction_definition.format == 'ARCHIVE_JSON'
+        extract_archive_and_save(request)
+      else
+        extract_and_save_document(request)
+      end
+    end
+
+    def next_page
+      @extraction_definition.page += 1
+    end
+
+    def execution_cancelled?
+      @extraction_job.reload.cancelled?
+    end
+
     def stop_condition_met?
       [set_number_reached?, extraction_failed?, duplicate_document_extracted?, custom_stop_conditions_met?].any?(true)
     end
@@ -53,15 +70,23 @@ def duplicate_document_extracted?
     end
 
     def custom_stop_conditions_met?
-      return false if @extraction_definition.stop_conditions.empty?
+      stop_conditions = @extraction_definition.stop_conditions
+      return false if stop_conditions.empty?
 
-      @extraction_definition.stop_conditions.map { |condition| condition.evaluate(@de.document.body) }.any?(true)
+      stop_conditions.map { |condition| condition.evaluate(@de.document.body) }.any?(true)
     end
 
     def throttle
       sleep @extraction_definition.throttle / 1000.0
     end
 
+    def extract_archive_and_save(request)
+      extraction_folder = @extraction_job.extraction_folder
+      @de = ArchiveExtraction.new(request, extraction_folder, @previous_request)
+      @de.download_archive
+      @de.save_entries(extraction_folder)
+    end
+
     def extract_and_save_document(request)
       @de = DocumentExtraction.new(request, @extraction_job.extraction_folder, @previous_request)
       @previous_request = @de.extract
-Original file line number
+Diff line change
@@ Expand Up / @@ -23,6 +23,7 @@ class ExtractionJob < ApplicationRecord @@
       end
       delegate :format, to: :extraction_definition
+      delegate :json?, to: :extraction_definition
       # Returns the fullpath to the extraction folder for this job
       #
@@ Expand Down @@