diff --git a/.github/workflows/mirror_to_gitlab.yml b/.github/workflows/mirror_to_gitlab.yml deleted file mode 100644 index 729c982b..00000000 --- a/.github/workflows/mirror_to_gitlab.yml +++ /dev/null @@ -1,17 +0,0 @@ -name: Mirror to GitLab - -on: [push] - -jobs: - mirror-repository: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v3 - with: - fetch-depth: 0 - - name: mirror-repository - uses: yesolutions/mirror-action@master - with: - REMOTE: ${{ secrets.GITLAB_REPO_SSH }} - GIT_SSH_PRIVATE_KEY: ${{ secrets.GIT_SSH_PRIVATE_KEY }} - GIT_SSH_NO_VERIFY_HOST: ${{ secrets.GIT_SSH_NO_VERIFY_HOST }} diff --git a/.gitignore b/.gitignore index ef7f005c..15b53f5d 100644 --- a/.gitignore +++ b/.gitignore @@ -36,6 +36,7 @@ # Ignore master key for decrypting credentials and more. /config/master.key +/config/credentials.yml.enc /app/assets/builds/* !/app/assets/builds/.keep diff --git a/Dockerfile b/Dockerfile index 87c476af..7ca41f3e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -3,7 +3,7 @@ FROM ruby:3.2.2-alpine3.18 WORKDIR /app ARG BUILD_PACKAGES="build-base curl-dev git" -ARG DEV_PACKAGES="bash mysql-client mariadb-dev yaml-dev zlib-dev nodejs yarn libxml2 libxml2-dev libxslt libxslt-dev gmp-dev" +ARG DEV_PACKAGES="bash mysql-client mariadb-dev yaml-dev zlib-dev nodejs yarn libxml2 libxml2-dev libxslt libxslt-dev gmp-dev openjdk8-jre" ARG RUBY_PACKAGES="tzdata" WORKDIR /app @@ -34,6 +34,8 @@ ARG RAILS_ENV="production" ENV RAILS_ENV=$RAILS_ENV ARG RAILS_MASTER_KEY ENV RAILS_MASTER_KEY=$RAILS_MASTER_KEY +ARG RAILS_SECRET_KEY_BASE +ENV SECRET_KEY_BASE=$RAILS_SECRET_KEY_BASE RUN bundle exec rails assets:precompile diff --git a/Gemfile b/Gemfile index ef72ad3d..37722f37 100644 --- a/Gemfile +++ b/Gemfile @@ -42,6 +42,7 @@ gem 'faraday-follow_redirects' gem 'jsonpath' gem 'nokogiri' gem 'sidekiq' +gem 'yomu' # transformation related gem 'webmock' diff --git a/Gemfile.lock b/Gemfile.lock index 1b9a7b1a..180902a6 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -13,69 +13,69 @@ GIT GEM remote: https://rubygems.org/ specs: - actioncable (7.0.7.2) - actionpack (= 7.0.7.2) - activesupport (= 7.0.7.2) + actioncable (7.0.8.1) + actionpack (= 7.0.8.1) + activesupport (= 7.0.8.1) nio4r (~> 2.0) websocket-driver (>= 0.6.1) - actionmailbox (7.0.7.2) - actionpack (= 7.0.7.2) - activejob (= 7.0.7.2) - activerecord (= 7.0.7.2) - activestorage (= 7.0.7.2) - activesupport (= 7.0.7.2) + actionmailbox (7.0.8.1) + actionpack (= 7.0.8.1) + activejob (= 7.0.8.1) + activerecord (= 7.0.8.1) + activestorage (= 7.0.8.1) + activesupport (= 7.0.8.1) mail (>= 2.7.1) net-imap net-pop net-smtp - actionmailer (7.0.7.2) - actionpack (= 7.0.7.2) - actionview (= 7.0.7.2) - activejob (= 7.0.7.2) - activesupport (= 7.0.7.2) + actionmailer (7.0.8.1) + actionpack (= 7.0.8.1) + actionview (= 7.0.8.1) + activejob (= 7.0.8.1) + activesupport (= 7.0.8.1) mail (~> 2.5, >= 2.5.4) net-imap net-pop net-smtp rails-dom-testing (~> 2.0) - actionpack (7.0.7.2) - actionview (= 7.0.7.2) - activesupport (= 7.0.7.2) + actionpack (7.0.8.1) + actionview (= 7.0.8.1) + activesupport (= 7.0.8.1) rack (~> 2.0, >= 2.2.4) rack-test (>= 0.6.3) rails-dom-testing (~> 2.0) rails-html-sanitizer (~> 1.0, >= 1.2.0) - actiontext (7.0.7.2) - actionpack (= 7.0.7.2) - activerecord (= 7.0.7.2) - activestorage (= 7.0.7.2) - activesupport (= 7.0.7.2) + actiontext (7.0.8.1) + actionpack (= 7.0.8.1) + activerecord (= 7.0.8.1) + activestorage (= 7.0.8.1) + activesupport (= 7.0.8.1) globalid (>= 0.6.0) nokogiri (>= 1.8.5) - actionview (7.0.7.2) - activesupport (= 7.0.7.2) + actionview (7.0.8.1) + activesupport (= 7.0.8.1) builder (~> 3.1) erubi (~> 1.4) rails-dom-testing (~> 2.0) rails-html-sanitizer (~> 1.1, >= 1.2.0) - activejob (7.0.7.2) - activesupport (= 7.0.7.2) + activejob (7.0.8.1) + activesupport (= 7.0.8.1) globalid (>= 0.3.6) - activemodel (7.0.7.2) - activesupport (= 7.0.7.2) - activerecord (7.0.7.2) - activemodel (= 7.0.7.2) - activesupport (= 7.0.7.2) + activemodel (7.0.8.1) + activesupport (= 7.0.8.1) + activerecord (7.0.8.1) + activemodel (= 7.0.8.1) + activesupport (= 7.0.8.1) activerecord-nulldb-adapter (0.9.0) activerecord (>= 5.2.0, < 7.1) - activestorage (7.0.7.2) - actionpack (= 7.0.7.2) - activejob (= 7.0.7.2) - activerecord (= 7.0.7.2) - activesupport (= 7.0.7.2) + activestorage (7.0.8.1) + actionpack (= 7.0.8.1) + activejob (= 7.0.8.1) + activerecord (= 7.0.8.1) + activesupport (= 7.0.8.1) marcel (~> 1.0) mini_mime (>= 1.1.0) - activesupport (7.0.7.2) + activesupport (7.0.8.1) concurrent-ruby (~> 1.0, >= 1.0.2) i18n (>= 1.6, < 2) minitest (>= 5.1) @@ -83,7 +83,7 @@ GEM addressable (2.8.4) public_suffix (>= 2.0.2, < 6.0) ast (2.4.2) - bcrypt (3.1.18) + bcrypt (3.1.20) better_html (2.0.2) actionview (>= 6.0) activesupport (>= 6.0) @@ -108,16 +108,16 @@ GEM chunky_png (1.4.0) coderay (1.1.3) colorize (1.1.0) - concurrent-ruby (1.2.2) + concurrent-ruby (1.2.3) connection_pool (2.4.1) crack (0.4.5) rexml crass (1.0.6) - date (3.3.3) + date (3.3.4) debug (1.7.2) irb (>= 1.5.0) reline (>= 0.3.1) - devise (4.9.2) + devise (4.9.3) bcrypt (~> 3.0) orm_adapter (~> 0.1) railties (>= 4.1.0) @@ -128,7 +128,7 @@ GEM devise (~> 4.0) railties (~> 7.0) rotp (~> 6.0) - devise_invitable (2.0.8) + devise_invitable (2.0.9) actionmailer (>= 5.0) devise (>= 4.6) diff-lcs (1.5.0) @@ -171,7 +171,7 @@ GEM fugit (1.8.1) et-orbi (~> 1, >= 1.2.7) raabro (~> 1.4) - globalid (1.2.0) + globalid (1.2.1) activesupport (>= 6.1) hashdiff (1.0.1) http (5.1.1) @@ -183,7 +183,7 @@ GEM http-cookie (1.0.5) domain_name (~> 0.5) http-form_data (2.3.0) - i18n (1.14.1) + i18n (1.14.4) concurrent-ruby (~> 1.0) io-console (0.6.0) irb (1.6.4) @@ -210,7 +210,7 @@ GEM llhttp-ffi (0.4.0) ffi-compiler (~> 1.0) rake (~> 13.0) - loofah (2.21.3) + loofah (2.22.0) crass (~> 1.0.2) nokogiri (>= 1.12.0) mail (2.8.1) @@ -218,24 +218,24 @@ GEM net-imap net-pop net-smtp - marcel (1.0.2) + marcel (1.0.4) matrix (0.4.2) method_source (1.0.0) mime-types (3.4.1) mime-types-data (~> 3.2015) mime-types-data (3.2023.0218.1) mini_mime (1.1.5) - minitest (5.19.0) + minitest (5.22.2) multi_json (1.15.0) - mysql2 (0.5.5) - net-imap (0.3.7) + mysql2 (0.5.6) + net-imap (0.4.10) date net-protocol net-pop (0.1.2) net-protocol - net-protocol (0.2.1) + net-protocol (0.2.2) timeout - net-smtp (0.3.3) + net-smtp (0.4.0.1) net-protocol netrc (0.11.0) nio4r (2.7.0) @@ -261,27 +261,27 @@ GEM nio4r (~> 2.0) raabro (1.4.0) racc (1.7.3) - rack (2.2.8) + rack (2.2.8.1) rack-mini-profiler (3.3.0) rack (>= 1.2.0) rack-proxy (0.7.6) rack rack-test (2.1.0) rack (>= 1.3) - rails (7.0.7.2) - actioncable (= 7.0.7.2) - actionmailbox (= 7.0.7.2) - actionmailer (= 7.0.7.2) - actionpack (= 7.0.7.2) - actiontext (= 7.0.7.2) - actionview (= 7.0.7.2) - activejob (= 7.0.7.2) - activemodel (= 7.0.7.2) - activerecord (= 7.0.7.2) - activestorage (= 7.0.7.2) - activesupport (= 7.0.7.2) + rails (7.0.8.1) + actioncable (= 7.0.8.1) + actionmailbox (= 7.0.8.1) + actionmailer (= 7.0.8.1) + actionpack (= 7.0.8.1) + actiontext (= 7.0.8.1) + actionview (= 7.0.8.1) + activejob (= 7.0.8.1) + activemodel (= 7.0.8.1) + activerecord (= 7.0.8.1) + activestorage (= 7.0.8.1) + activesupport (= 7.0.8.1) bundler (>= 1.15.0) - railties (= 7.0.7.2) + railties (= 7.0.8.1) rails-dom-testing (2.2.0) activesupport (>= 5.0.0) minitest @@ -289,21 +289,21 @@ GEM rails-html-sanitizer (1.6.0) loofah (~> 2.21) nokogiri (~> 1.14) - railties (7.0.7.2) - actionpack (= 7.0.7.2) - activesupport (= 7.0.7.2) + railties (7.0.8.1) + actionpack (= 7.0.8.1) + activesupport (= 7.0.8.1) method_source rake (>= 12.2) thor (~> 1.0) zeitwerk (~> 2.5) rainbow (3.1.1) - rake (13.0.6) + rake (13.1.0) redis-client (0.17.0) connection_pool regexp_parser (2.8.0) reline (0.3.3) io-console (~> 0.5) - responders (3.1.0) + responders (3.1.1) actionpack (>= 5.2) railties (>= 5.2) rest-client (2.1.0) @@ -394,8 +394,8 @@ GEM actionpack (>= 5.2) activesupport (>= 5.2) sprockets (>= 3.0.0) - thor (1.2.2) - timeout (0.4.0) + thor (1.3.1) + timeout (0.4.1) tzinfo (2.0.6) concurrent-ruby (~> 1.0) unf (0.1.4) @@ -425,8 +425,9 @@ GEM websocket-extensions (0.1.5) xpath (3.2.0) nokogiri (~> 1.8) - yard (0.9.34) - zeitwerk (2.6.11) + yard (0.9.36) + yomu (0.1.5) + zeitwerk (2.6.13) PLATFORMS aarch64-linux-musl @@ -475,6 +476,7 @@ DEPENDENCIES webdrivers webmock yard + yomu RUBY VERSION ruby 3.2.2p53 diff --git a/app/controllers/extraction_definitions_controller.rb b/app/controllers/extraction_definitions_controller.rb index af756cea..72546bf7 100644 --- a/app/controllers/extraction_definitions_controller.rb +++ b/app/controllers/extraction_definitions_controller.rb @@ -98,7 +98,8 @@ def find_destinations def extraction_definition_params safe_params = params.require(:extraction_definition).permit( :pipeline_id, :name, :format, :base_url, :throttle, :page, :per_page, - :total_selector, :kind, :destination_id, :source_id, :enrichment_url, :paginated, :split, :split_selector + :total_selector, :kind, :destination_id, :source_id, :enrichment_url, :paginated, :split, :split_selector, + :extract_text_from_file ) merge_last_edited_by(safe_params) end diff --git a/app/frontend/js/apps/ExtractionApp/components/HeaderActions.jsx b/app/frontend/js/apps/ExtractionApp/components/HeaderActions.jsx index e779c2eb..54a93b36 100644 --- a/app/frontend/js/apps/ExtractionApp/components/HeaderActions.jsx +++ b/app/frontend/js/apps/ExtractionApp/components/HeaderActions.jsx @@ -60,11 +60,12 @@ const HeaderActions = () => { return createPortal( <> - {!appDetails.extractionDefinition.split && ( - - )} + {!appDetails.extractionDefinition.split && + !appDetails.extractionDefinition.extract_text_from_file && ( + + )} {appDetails.extractionDefinition.split && } diff --git a/app/models/extraction_definition.rb b/app/models/extraction_definition.rb index 4b7e53e5..4b683dd7 100644 --- a/app/models/extraction_definition.rb +++ b/app/models/extraction_definition.rb @@ -36,6 +36,7 @@ class ExtractionDefinition < ApplicationRecord validates :name, uniqueness: true validates :split_selector, presence: true, if: :split? + validates :s3_bucket, presence: true, if: :s3? validates :throttle, numericality: { only_integer: true, greater_than_or_equal_to: 0, less_than_or_equal_to: 60_000 } diff --git a/app/models/harvest_report.rb b/app/models/harvest_report.rb index 10d2df07..8b67013a 100644 --- a/app/models/harvest_report.rb +++ b/app/models/harvest_report.rb @@ -115,6 +115,7 @@ def statuses def idle_offset return 0 if extraction_end_time.blank? return @idle_offset if @idle_offset.present? + return 0 if transformation_start_time.blank? || extraction_end_time.blank? @idle_offset = transformation_start_time - extraction_end_time @idle_offset = 0 if @idle_offset.negative? diff --git a/app/sidekiq/extraction_worker.rb b/app/sidekiq/extraction_worker.rb index e3a0bf62..8bf9ee97 100644 --- a/app/sidekiq/extraction_worker.rb +++ b/app/sidekiq/extraction_worker.rb @@ -16,6 +16,10 @@ def child_perform(extraction_job) SplitWorker.perform_async(extraction_job.id) if extraction_job.extraction_definition.split end + + return unless extraction_job.extraction_definition.extract_text_from_file? + + TextExtractionWorker.perform_async(extraction_job.id) end def job_start diff --git a/app/sidekiq/file_extraction_worker.rb b/app/sidekiq/file_extraction_worker.rb new file mode 100644 index 00000000..eb1b3a73 --- /dev/null +++ b/app/sidekiq/file_extraction_worker.rb @@ -0,0 +1,85 @@ +# frozen_string_literal: true + +class FileExtractionWorker + include Sidekiq::Job + + sidekiq_options retry: 0 + + def perform(extraction_job_id) + initialize_instance_variables(extraction_job_id) + + reset_harvest_report(harvest_report) if @extraction_job.harvest_job.present? + + setup_tmp_directory + move_extracted_documents_into_tmp_directory + process_extracted_documents + + FileUtils.remove_dir(@tmp_directory) + + create_transformation_jobs if @extraction_job.harvest_job.present? + end + + private + + def initialize_instance_variables(extraction_job_id) + @extraction_job = ExtractionJob.find(extraction_job_id) + @extraction_definition = @extraction_job.extraction_definition + @extraction_folder = @extraction_job.extraction_folder + @tmp_directory = "#{@extraction_folder}/tmp" + @page = 1 + end + + def create_transformation_jobs + (@extraction_job.extraction_definition.page..@extraction_job.documents.total_pages).each do |page| + create_transformation_job(page) + pipeline_job.reload + break if pipeline_job.cancelled? + end + end + + def harvest_report + @extraction_job.harvest_job.harvest_report + end + + def pipeline_job + harvest_report.pipeline_job + end + + def create_transformation_job(page) + TransformationWorker.perform_async(@extraction_job.harvest_job.id, page, api_record_id(page)) + harvest_report.increment_transformation_workers_queued! + end + + def api_record_id(page) + return nil unless @extraction_job.extraction_definition.enrichment? + + @extraction_job.documents[page].file_path.match(/__(?.+)__/)[:record_id] + end + + def reset_harvest_report(harvest_report) + harvest_report.transformation_queued! + harvest_report.load_queued! + end + + def setup_tmp_directory + return if Dir.exist?(@tmp_directory) + + Dir.mkdir(@tmp_directory) + end + + def move_extracted_documents_into_tmp_directory + Dir.children(@extraction_folder).each do |file| + next if file == 'tmp' + + FileUtils.move("#{@extraction_folder}/#{file}", "#{@tmp_directory}/#{file}") + end + end + + def process_extracted_documents + raise 'process_extracted_documents not defined in child class' + end + + def create_document + raise 'create_document not defined in child class' + end +end diff --git a/app/sidekiq/split_worker.rb b/app/sidekiq/split_worker.rb index c6e85440..881b5867 100644 --- a/app/sidekiq/split_worker.rb +++ b/app/sidekiq/split_worker.rb @@ -1,63 +1,6 @@ # frozen_string_literal: true -class SplitWorker - include Sidekiq::Job - - sidekiq_options retry: 0 - - def perform(extraction_job_id) - @extraction_job = ExtractionJob.find(extraction_job_id) - @extraction_definition = @extraction_job.extraction_definition - @extraction_folder = @extraction_job.extraction_folder - @tmp_directory = "#{@extraction_folder}/tmp" - @page = 1 - - setup_tmp_directory - move_extracted_documents_into_tmp_directory - process_extracted_documents - - FileUtils.remove_dir(@tmp_directory) - - create_transformation_jobs if @extraction_job.harvest_job.present? - end - - private - - def create_transformation_jobs - harvest_report = @extraction_job.harvest_job.harvest_report - pipeline_job = harvest_report.pipeline_job - - reset_harvest_report(harvest_report) - - (@extraction_job.extraction_definition.page..@extraction_job.documents.total_pages).each do |page| - harvest_report.increment_pages_extracted! - TransformationWorker.perform_async(@extraction_job.harvest_job.id, page) - harvest_report.increment_transformation_workers_queued! - - pipeline_job.reload - break if pipeline_job.cancelled? - end - end - - def reset_harvest_report(harvest_report) - harvest_report.update(pages_extracted: 0) - harvest_report.extraction_completed! - end - - def setup_tmp_directory - return if Dir.exist?(@tmp_directory) - - Dir.mkdir(@tmp_directory) - end - - def move_extracted_documents_into_tmp_directory - Dir.children(@extraction_folder).each do |file| - next if file == 'tmp' - - FileUtils.move("#{@extraction_folder}/#{file}", "#{@tmp_directory}/#{file}") - end - end - +class SplitWorker < FileExtractionWorker def process_extracted_documents Dir.children(@tmp_directory).each do |file| saved_response = JSON.parse(File.read("#{@tmp_directory}/#{file}")) diff --git a/app/sidekiq/text_extraction_worker.rb b/app/sidekiq/text_extraction_worker.rb new file mode 100644 index 00000000..a8a55173 --- /dev/null +++ b/app/sidekiq/text_extraction_worker.rb @@ -0,0 +1,25 @@ +# frozen_string_literal: true + +class TextExtractionWorker < FileExtractionWorker + def process_extracted_documents + Dir.children(@tmp_directory).each do |file| + saved_file = File.read("#{@tmp_directory}/#{file}") + + saved_response = { 'method' => 'GET', 'status' => 200, 'response_headers' => [], 'request_headers' => [] } + + create_document(Yomu.read(:text, saved_file), saved_response, file) + @page += 1 + end + + @extraction_definition.update(format: 'JSON') + end + + def create_document(extracted_text, saved_response, filename) + Extraction::Document.new( + url: saved_response['url'], method: saved_response['method'], + params: saved_response['params'], request_headers: saved_response['request_headers'], + status: saved_response['status'], response_headers: saved_response['response_headers'], + body: { text: extracted_text }.to_json + ).save("#{@extraction_folder}/#{filename}") + end +end diff --git a/app/supplejack/extraction/document.rb b/app/supplejack/extraction/document.rb index 1c284189..7c2a7ee7 100644 --- a/app/supplejack/extraction/document.rb +++ b/app/supplejack/extraction/document.rb @@ -23,6 +23,11 @@ def successful? def save(file_path) File.write(file_path, to_json) + # If the file fails to be converted to a JSON document + # write the original file to the filepath as a binary + # It is probably a PDF or Word Doc + rescue JSON::GeneratorError + File.write(file_path, @body, mode: 'wb') end def size_in_bytes @@ -35,6 +40,8 @@ def self.load_from_file(file_path) Rails.logger.debug { "Loading document #{file_path}" } json = JSON.parse(File.read(file_path)).symbolize_keys Document.new(file_path, **json) + rescue JSON::ParserError + {} end def to_hash diff --git a/app/supplejack/extraction/enrichment_execution.rb b/app/supplejack/extraction/enrichment_execution.rb index 7b370370..0bf3a2e9 100644 --- a/app/supplejack/extraction/enrichment_execution.rb +++ b/app/supplejack/extraction/enrichment_execution.rb @@ -58,6 +58,7 @@ def new_enrichment_extraction(api_record, page) def enqueue_record_transformation(api_record, document, page) return unless @harvest_job.present? && document.successful? + return if @extraction_definition.extract_text_from_file? TransformationWorker.perform_async(@harvest_job.id, page, api_record['id']) @harvest_report.increment_transformation_workers_queued! if @harvest_report.present? diff --git a/app/supplejack/extraction/execution.rb b/app/supplejack/extraction/execution.rb index a5ccf37d..e0231203 100644 --- a/app/supplejack/extraction/execution.rb +++ b/app/supplejack/extraction/execution.rb @@ -79,10 +79,15 @@ def extract_and_save_document(request) end def enqueue_record_transformation - return unless @harvest_job.present? && @de.document.successful? && !@extraction_definition.split + return unless @harvest_job.present? && @de.document.successful? + return if requires_additional_processing? TransformationWorker.perform_async(@harvest_job.id, @extraction_definition.page) @harvest_report.increment_transformation_workers_queued! end + + def requires_additional_processing? + @extraction_definition.split? || @extraction_definition.extract_text_from_file? + end end end diff --git a/app/supplejack/transformation/transformed_record.rb b/app/supplejack/transformation/transformed_record.rb index a72fbe12..f5c10d64 100644 --- a/app/supplejack/transformation/transformed_record.rb +++ b/app/supplejack/transformation/transformed_record.rb @@ -23,14 +23,8 @@ def errors end end - def rejection_reasons - @reject_fields.each_with_object([]) do |field, reasons| - reasons.push(field.name) if field.value == true - end - end - - def deletion_reasons - @delete_fields.each_with_object([]) do |field, reasons| + def reasons(fields) + fields.each_with_object([]) do |field, reasons| reasons.push(field.name) if field.value == true end end @@ -39,8 +33,8 @@ def to_hash { 'transformed_record' => transformed_record, 'errors' => errors, - 'rejection_reasons' => rejection_reasons, - 'deletion_reasons' => deletion_reasons + 'rejection_reasons' => reasons(@reject_fields), + 'deletion_reasons' => reasons(@delete_fields) } end end diff --git a/app/views/extraction_definitions/_create_edit_enrichment_modal.html.erb b/app/views/extraction_definitions/_create_edit_enrichment_modal.html.erb index bf014d32..d1d8034b 100644 --- a/app/views/extraction_definitions/_create_edit_enrichment_modal.html.erb +++ b/app/views/extraction_definitions/_create_edit_enrichment_modal.html.erb @@ -84,8 +84,28 @@
- <%= form.select :destination_id, options_from_collection_for_select(@destinations, 'id', 'name'), {}, - class: 'form-select' %> + <%= form.select(:destination_id, + options_from_collection_for_select( + @destinations, 'id', 'name', @extraction_definition&.destination&.id + ), + {}, class: 'form-select') %> +
+ +
+ <%= form.label :extract_text_from_file, class: 'form-label' do %> + Extract text from file + + + + <% end %> +
+ +
+ <%= form.select :extract_text_from_file, options_for_select( + [%w[No false], %w[Yes true]], model.extract_text_from_file + ), {}, class: 'form-select' %>
diff --git a/app/views/extraction_definitions/_create_edit_harvest_modal.html.erb b/app/views/extraction_definitions/_create_edit_harvest_modal.html.erb index 15d23b7e..392203a6 100644 --- a/app/views/extraction_definitions/_create_edit_harvest_modal.html.erb +++ b/app/views/extraction_definitions/_create_edit_harvest_modal.html.erb @@ -98,6 +98,23 @@ ), {}, class: 'form-select' %> +
+ <%= form.label :extract_text_from_file, class: 'form-label' do %> + Extract text from file + + + + <% end %> +
+ +
+ <%= form.select :extract_text_from_file, options_for_select( + [%w[No false], %w[Yes true]], model.extract_text_from_file + ), {}, class: 'form-select' %> +
+
<%= form.label :split, class: 'form-label' do %> Split diff --git a/app/views/extraction_jobs/show.html.erb b/app/views/extraction_jobs/show.html.erb index d8ab34e6..aba9b832 100644 --- a/app/views/extraction_jobs/show.html.erb +++ b/app/views/extraction_jobs/show.html.erb @@ -58,6 +58,8 @@

The job hasn't started yet so there is no results available.

<% elsif @extraction_job.running? %>

The job is currently running. This page might not be fetched yet.

+ <% elsif @extraction_definition.extract_text_from_file %> +

The text has not yet been extracted from the file.

<% else %>

The extracted page could not be found.

<% end %> diff --git a/app/views/pipelines/_card.html.erb b/app/views/pipelines/_card.html.erb index e0fa5a4d..ab932ae4 100644 --- a/app/views/pipelines/_card.html.erb +++ b/app/views/pipelines/_card.html.erb @@ -16,7 +16,7 @@ <% end %>

- <%= send("#{type}_card_subtitle", definition) %> + <%= send(:"#{type}_card_subtitle", definition) %>

<% if type == 'extraction' %> @@ -173,7 +173,7 @@