Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

NEW SJ PP HARVEST: As Dan, I want new SJ to be able to harvest from the PapersPast open data set, so that we can start experimenting with how to feasibly bring the whole collection into our system #70

Merged
merged 3 commits into from
Jul 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions Gemfile
Original file line number Diff line number Diff line change
Expand Up @@ -42,9 +42,11 @@ gem 'rqrcode'
gem 'faraday', '~> 2.9'
gem 'faraday-follow_redirects'
gem 'jsonpath'
gem 'minitar'
gem 'nokogiri'
gem 'sidekiq'
gem 'yomu'
gem 'zlib'

# transformation related
gem 'webmock'
Expand Down
8 changes: 6 additions & 2 deletions Gemfile.lock
Original file line number Diff line number Diff line change
Expand Up @@ -243,7 +243,8 @@ GEM
mime-types-data (~> 3.2015)
mime-types-data (3.2024.0507)
mini_mime (1.1.5)
minitest (5.22.3)
minitar (0.9)
minitest (5.19.0)
multi_json (1.15.0)
mutex_m (0.2.0)
mysql2 (0.5.6)
Expand Down Expand Up @@ -465,7 +466,8 @@ GEM
nokogiri (~> 1.8)
yard (0.9.36)
yomu (0.1.5)
zeitwerk (2.6.13)
zeitwerk (2.6.16)
zlib (3.1.1)

PLATFORMS
aarch64-linux-musl
Expand Down Expand Up @@ -493,6 +495,7 @@ DEPENDENCIES
jsonpath
kaminari
letter_opener
minitar
mysql2 (~> 0.5)
nokogiri
pry-byebug
Expand All @@ -517,6 +520,7 @@ DEPENDENCIES
webmock
yard
yomu
zlib

RUBY VERSION
ruby 3.2.2p53
Expand Down
4 changes: 3 additions & 1 deletion app/controllers/requests_controller.rb
Original file line number Diff line number Diff line change
Expand Up @@ -36,9 +36,11 @@ def harvest_request

@previous_response = Extraction::DocumentExtraction.new(@previous_request).extract
end
document = Extraction::DocumentExtraction.new(@request, nil, @previous_response).extract
document.body = "Tar files can't be displayed" if @extraction_definition.format == 'ARCHIVE_JSON'

render json: @request.to_h.merge(
preview: Extraction::DocumentExtraction.new(@request, nil, @previous_response).extract
preview: document
)
end

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ const RecordViewer = ({ record, format }) => {
const editor = useRef();

function doc() {
if (format == "JSON") {
if (format == "JSON" || format == "ARCHIVE_JSON") {
return JSON.stringify(record, null, 2);
} else if (format == "XML") {
return xmlFormat(record, { indentation: " ", lineSeparator: "\n" });
Expand Down
4 changes: 2 additions & 2 deletions app/frontend/js/components/CodeEditor.jsx
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ const CodeEditor = ({ initContent, onChange, format = "ruby", ...props }) => {
const editorRef = useRef();

const editorExtensions = () => {
if (format == "JSON") {
if (format == "JSON" || format == "ARCHIVE_JSON") {
return [basicSetup, json(), EditorState.readOnly.of(true)];
} else if (format == "XML" || format == "HTML") {
return [
Expand Down Expand Up @@ -39,7 +39,7 @@ const CodeEditor = ({ initContent, onChange, format = "ruby", ...props }) => {

const doc = () => {
let content;
if (format == "JSON") {
if (format == "JSON" || format == "ARCHIVE_JSON") {
try {
content = JSON.stringify(JSON.parse(initContent), null, 2);
} catch (err) {
Expand Down
4 changes: 2 additions & 2 deletions app/frontend/js/editor.js
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ import { ruby } from "@codemirror/legacy-modes/mode/ruby";
import { xml } from "@codemirror/legacy-modes/mode/xml";

export function editorExtensions(format, readOnly, formField) {
if (format == "JSON") {
if (format == "JSON" || format == "ARCHIVE_JSON") {
return [basicSetup, json(), EditorState.readOnly.of(readOnly)];
} else if (format == "XML" || format == "HTML") {
return [
Expand Down Expand Up @@ -52,7 +52,7 @@ if (extractionResultViewer) {
const format = extractionResultViewer.dataset.format;
let results = extractionResultViewer.dataset.results;

if (format == "JSON") {
if (format == "JSON" || format == "ARCHIVE_JSON") {
results = JSON.stringify(JSON.parse(results), null, 2);
} else if (format == "XML") {
results = xmlFormat(results, { indentation: " ", lineSeparator: "\n" });
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ function displayPreview(data) {
} = data;

try {
if (format == "JSON") {
if (format == "JSON" || format == "ARCHIVE_JSON") {
result = JSON.stringify(JSON.parse(result), null, 2);
} else if (format == "XML") {
result = xmlFormat(result, {
Expand Down Expand Up @@ -145,7 +145,10 @@ function bindRecordSelectorTestEventListeners(id) {
(response, _alertClass) => {
let results = response.data.result;

if (response.data.format == "JSON") {
if (
response.data.format == "JSON" ||
response.data.format == "ARCHIVE_JSON"
) {
results = JSON.stringify(response.data.result, null, 2);
} else if (response.data.format == "XML") {
results = xmlFormat(response.data.result, {
Expand Down
6 changes: 5 additions & 1 deletion app/models/extraction_definition.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
# Used to store the information for running an extraction
#
class ExtractionDefinition < ApplicationRecord
FORMATS = %w[JSON XML HTML].freeze
FORMATS = %w[JSON XML HTML ARCHIVE_JSON].freeze

# The destination is used for Enrichment Extractions
# To know where to pull the records that are to be enriched from
Expand Down Expand Up @@ -59,6 +59,10 @@ def to_h
}
end

def json?
format.in? %w[JSON ARCHIVE_JSON]
end

def shared?
@shared = harvest_definitions.count > 1 if @shared.nil?
@shared
Expand Down
1 change: 1 addition & 0 deletions app/models/extraction_job.rb
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ class ExtractionJob < ApplicationRecord
end

delegate :format, to: :extraction_definition
delegate :json?, to: :extraction_definition

# Returns the fullpath to the extraction folder for this job
#
Expand Down
47 changes: 47 additions & 0 deletions app/supplejack/extraction/archive.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# frozen_string_literal: true

require 'archive/tar/minitar'

module Extraction
class Archive
def self.gzipped?(gz_file_path)
File.open(gz_file_path, 'rb') do |file|
magic_number = file.read(2).unpack('C2')
return magic_number == [0x1F, 0x8B]
end
end

def self.extract_from_gz(gz_path)
Zlib::GzipReader.open(gz_path, &:read)
end

def self.top_level_entry?(full_name)
count = full_name.count('/')
return true if count.zero?
return true if count == 1 && full_name[-1] == '/'

false
end

def self.list_top_level_entries(input)
top_level_entries = []
input.each do |entry|
full_name = entry.full_name
top_level_entries << full_name if Archive.top_level_entry?(full_name)
end

top_level_entries
end

def self.body(extracted_file_path)
body = if Archive.gzipped?(extracted_file_path)
Archive.extract_from_gz(extracted_file_path)
else
File.read(extracted_file_path)
end

File.delete(extracted_file_path)
body
end
end
end
96 changes: 96 additions & 0 deletions app/supplejack/extraction/archive_extraction.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
# frozen_string_literal: true

module Extraction
class ArchiveExtraction < AbstractExtraction
def initialize(request, extraction_folder = nil, response = nil)
super()
@request = request
@extraction_folder = extraction_folder
@response = response
@document = nil
end

def download_archive
extract
end

def save_entries(extraction_folder)
each_entry(extraction_folder) do |_, name|
current_document = build_doc("#{extraction_folder}/#{name}")
save(current_document)
next_page
end
ensure
clear_extracted_archive(extraction_folder)
end

private

def each_entry(extraction_folder)
Minitar::Input.open(StringIO.new(@document.body)) do |input|
input.each do |entry|
input.extract_entry(extraction_folder, entry) do |action, name, stats|
next if action != :file_done

yield(action, name, stats)
end
end
end
end

def clear_extracted_archive(extraction_folder)
top_level_entries = []
Minitar::Input.open(StringIO.new(@document.body)) do |input|
top_level_entries = Archive.list_top_level_entries(input)
end
FileUtils.rm_rf(top_level_entries.compact_blank.map { |file| "#{extraction_folder}/#{file}" })
end

def build_doc(entry_path)
Document.new(
url: @document.url,
method: @document.method,
params: @document.params,
request_headers: @document.request_headers,
status: @document.status,
response_headers: @document.response_headers,
body: Archive.body(entry_path)
)
end

def extraction_definition
@request.extraction_definition
end

def next_page
extraction_definition.page += 1
end

def save(doc)
raise ArgumentError, 'extraction_folder was not provided in #new' if @extraction_folder.blank?
raise '#extract must be called before #save AbstractExtraction' if @document.blank?

doc.save(file_path)
end

def file_path
page_str = format('%09d', extraction_definition.page)[-9..]
name_str = extraction_definition.name.parameterize(separator: '_')
"#{@extraction_folder}/#{name_str}__-__#{page_str}.json"
end

def url
@request.url(@response)
end

def params
@request.query_parameters(@response)
end

def headers
return super if @request.headers.blank?

super.merge(@request.headers(@response))
end
end
end
3 changes: 2 additions & 1 deletion app/supplejack/extraction/document.rb
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@ module Extraction
# Manages the filesystem part of the request object
# Saves it to filesystem and loads it in memory
class Document
attr_reader :status, :request_headers, :response_headers, :body, :url, :method, :params, :file_path
attr_reader :status, :request_headers, :response_headers, :url, :method, :params, :file_path
attr_accessor :body

def initialize(file_path = nil, **kwargs)
@file_path = file_path
Expand Down
41 changes: 33 additions & 8 deletions app/supplejack/extraction/execution.rb
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
# frozen_string_literal: true

require 'zlib'
require 'archive/tar/minitar'

module Extraction
# Performs the work as defined in the document extraction
class Execution
Expand All @@ -11,24 +14,38 @@ def initialize(job, extraction_definition)
end

def call
extract_and_save_document(@extraction_definition.requests.first)
extract(@extraction_definition.requests.first)
return if @extraction_job.is_sample? || set_number_reached?
return unless @extraction_definition.paginated?

loop do
@extraction_definition.page += 1

extract_and_save_document(@extraction_definition.requests.last)

next_page
extract(@extraction_definition.requests.last)
throttle

break if @extraction_job.reload.cancelled?
break if execution_cancelled?
break if stop_condition_met?
end
end

private

def extract(request)
if @extraction_definition.format == 'ARCHIVE_JSON'
extract_archive_and_save(request)
else
extract_and_save_document(request)
end
end

def next_page
@extraction_definition.page += 1
end

def execution_cancelled?
@extraction_job.reload.cancelled?
end

def stop_condition_met?
[set_number_reached?, extraction_failed?, duplicate_document_extracted?, custom_stop_conditions_met?].any?(true)
end
Expand All @@ -53,15 +70,23 @@ def duplicate_document_extracted?
end

def custom_stop_conditions_met?
return false if @extraction_definition.stop_conditions.empty?
stop_conditions = @extraction_definition.stop_conditions
return false if stop_conditions.empty?

@extraction_definition.stop_conditions.map { |condition| condition.evaluate(@de.document.body) }.any?(true)
stop_conditions.map { |condition| condition.evaluate(@de.document.body) }.any?(true)
end

def throttle
sleep @extraction_definition.throttle / 1000.0
end

def extract_archive_and_save(request)
extraction_folder = @extraction_job.extraction_folder
@de = ArchiveExtraction.new(request, extraction_folder, @previous_request)
@de.download_archive
@de.save_entries(extraction_folder)
end

def extract_and_save_document(request)
@de = DocumentExtraction.new(request, @extraction_job.extraction_folder, @previous_request)
@previous_request = @de.extract
Expand Down
Loading
Loading