diff --git a/src/csv2rdf/csvw.clj b/src/csv2rdf/csvw.clj
index a2547f53..edea6ad5 100644
--- a/src/csv2rdf/csvw.clj
+++ b/src/csv2rdf/csvw.clj
@@ -23,7 +23,7 @@
 (defn annotate-tables [tabular-source metadata-source]
   (processing/get-metadata tabular-source metadata-source))
 
-(defn- validate-rows
+(defn- rows-are-valid?
   "Validates the CSVW schema for the given tabular file, metadata and options.
 
   `tabular-source` and `metadata-source` can be any of the following
@@ -36,20 +36,26 @@
 
   If metadata-source is non-nil then processing will start from the
-  asscociated metadata document, otherwise it will start from
-  tabular-source."
+  associated metadata document, otherwise it will start from
+  tabular-source.
+
+  Returns true if no validation errors were detected."
   [tabular-source metadata-source]
   (let [{:keys [tables] :as metadata} (processing/get-metadata tabular-source metadata-source)
         table-group-dialect (:dialect metadata)
         output-tables (remove properties/suppress-output? tables)
-        ;;ctx (table-group-context mode metadata) ;; TODO this might be useful later when iterating over tables
-        ]
-
-    (util/liberal-mapcat (fn [{:keys [url dialect] :as table}]
-                           ;;(validated-rows ctx table table-group-dialect)
-                           (let [dialect (or dialect table-group-dialect)]
-                             (csv/annotated-rows url table dialect)))
+        results (util/liberal-mapcat (fn [{:keys [url dialect] :as table}]
+                                       (let [dialect (or dialect table-group-dialect)]
+                                         (csv/annotated-rows url table dialect)))
 
-                         output-tables)))
+                                     output-tables)]
+    ;; Count the errors rather than calling empty?: empty? stops at the first
+    ;; error, so if the rows are produced lazily (TODO confirm) later rows
+    ;; might never be parsed and their warnings never logged.
+    (->> results
+         (mapcat :cells)
+         (mapcat :errors)
+         count
+         zero?)))
 
 (defn only-validate-schema
   "Only validate the data against the schemas in the metadata file, and
@@ -58,14 +64,7 @@
   Returns a map with the key `:data-validation-errors?` set to a boolean
   indicating whether any schema errors occurred."
   [{:keys [tabular-source metadata-source]}]
-  (let [errors? (atom false)]
-    (doseq [{:keys [cells] row-number :source-number :as _row} (validate-rows tabular-source metadata-source)
-            {:keys [errors column-number column] :as _cell} cells
-            :when (seq errors)]
-      (reset! errors? true)
-      (doseq [error errors]
-        (println (format "Row #%d col #%d (column '%s') has error: " row-number column-number (:name column)) error)))
-    {:data-validation-errors? @errors?}))
+  {:data-validation-errors? (not (rows-are-valid? tabular-source metadata-source))})
 
 (defn csv->rdf
   "Runs the CSVW process for the given tabular or metadata data sources
diff --git a/src/csv2rdf/logging.clj b/src/csv2rdf/logging.clj
index c378adca..e7c89de1 100644
--- a/src/csv2rdf/logging.clj
+++ b/src/csv2rdf/logging.clj
@@ -29,8 +29,14 @@
   (warn [_this _msg])
   (error [_this _msg]))
 
-(defn memory-logger []
-  (->MemoryLogger (atom []) (atom [])))
+(defn memory-logger
+  "Creates a MemoryLogger. With no arguments it is backed by two fresh
+  atoms; otherwise the supplied `warnings` and `errors` are passed straight
+  to the MemoryLogger, so the caller keeps a handle on them."
+  ([]
+   (memory-logger (atom []) (atom [])))
+  ([warnings errors]
+   (->MemoryLogger warnings errors)))
 
 (def ^:dynamic *logger* (->ForwardingLogger))
 
diff --git a/src/csv2rdf/tabular/csv.clj b/src/csv2rdf/tabular/csv.clj
index 4310a8b1..23d2e93e 100644
--- a/src/csv2rdf/tabular/csv.clj
+++ b/src/csv2rdf/tabular/csv.clj
@@ -29,7 +29,7 @@
                           (column/from-titles idx titles default-lang))
                         titles)]
     {:comments (mapv :comment comment-rows)
-     :columns columns}))
+     :columns  columns}))
 
 (defn ^{:tabular-spec "8.10.3"} validate-data-rows
   "Validates the data rows in the tabular file and extracts any embedded comments. The row number of any rows
@@ -102,9 +102,16 @@
                           cell))
                       parsed-cells)))
 
-(defn parse-row-cells [{:keys [cells] :as row} table {:keys [skipColumns] :as options}]
+(defn parse-row-cells
+  "Parses the cells of `row` against the columns of `table` and returns the
+  row with the results under :parsed-cells. Every cell error is logged as a
+  warning naming the source row, the column and the table's file."
+  [{:keys [cells source-row-number] :as row}
+   {:keys [url] :as table}
+   {:keys [skipColumns]}]
   (let [columns (table/columns table)
-        cell-values (concat (drop skipColumns cells) (repeat "")) ;;extend cells to cover any virtual columns
+        ;;extend cells to cover any virtual columns
+        cell-values (concat (drop skipColumns cells) (repeat ""))
         cell-column-pairs (map vector cell-values columns)
         parsed-cells (map-indexed (fn [col-idx [cell column]]
                                     (let [result (cell/parse-cell cell column)
@@ -115,8 +122,16 @@
                                              :column column)))
                                   cell-column-pairs)]
     ;;log cell errors
-    (doseq [err (mapcat cell/errors parsed-cells)]
-      (logging/log-warning err))
+    (doseq [{:keys [errors column-number column]} parsed-cells
+            error errors]
+      (logging/log-warning
+        (format "Row #%s col #%s (column '%s') in file: %s has error: %s"
+                source-row-number
+                column-number
+                (:name column)
+                ;; str is nil-safe, unlike calling .toString on the url
+                (last (string/split (str url) #"/"))
+                error)))
 
     (assoc row :parsed-cells parsed-cells)))
 
@@ -130,14 +145,14 @@
     (assoc column-value-bindings :_row number :_sourceRow source-row-number)))
 
 (defn get-cell-template-bindings [{:keys [column-number source-column-number column] :as cell}]
-  {:_name (util/percent-decode (properties/column-name column))
-   :_column column-number
+  {:_name         (util/percent-decode (properties/column-name column))
+   :_column       column-number
    :_sourceColumn source-column-number})
 
 (defn get-cell-urls [bindings table {:keys [column] :as cell}]
-  (let [property-urls {:aboutUrl (some-> (properties/about-url column) (template-property/resolve-uri-template-property bindings table))
+  (let [property-urls {:aboutUrl    (some-> (properties/about-url column) (template-property/resolve-uri-template-property bindings table))
                        :propertyUrl (some-> (properties/property-url column) (template-property/resolve-uri-template-property bindings table))
-                       :valueUrl (some-> (properties/value-url column) (template-property/resolve-value-uri-template-property cell column bindings table))}]
+                       :valueUrl    (some-> (properties/value-url column) (template-property/resolve-value-uri-template-property cell column bindings table))}]
     (util/filter-values some? property-urls)))
 
 (defn annotate-row [{:keys [number source-row-number parsed-cells] :as data-row} table title-column-indexes]
@@ -149,10 +164,10 @@
                           property-urls (get-cell-urls bindings table cell)]
                       (merge cell property-urls)))
                   parsed-cells)]
-    {:number number
+    {:number        number
      :source-number source-row-number
      :cells         (vec cells)
-     :titles (get-row-titles title-column-indexes parsed-cells)}))
+     :titles        (get-row-titles title-column-indexes parsed-cells)}))
 
 (defn skip-to-data-rows [rows {:keys [skipRows num-header-rows] :as options}]
   (let [row-offset (+ skipRows num-header-rows)]
diff --git a/test/csv2rdf/main_test.clj b/test/csv2rdf/main_test.clj
index 61fda0ff..d934514a 100644
--- a/test/csv2rdf/main_test.clj
+++ b/test/csv2rdf/main_test.clj
@@ -1,41 +1,60 @@
 (ns csv2rdf.main-test
-  (:require [csv2rdf.main :as sut]
+  (:require [csv2rdf.logging :as logging]
+            [csv2rdf.main :as sut]
             [clojure.test :as t]))
 
 ;; See issue 47
 ;; Resolving template property URIs with values containing spaces should work
 
-(defmacro capture
-  "Capture return value of body and stdout, and return a hashmap
-  of :return-value and :stdout."
-  [body]
-  `(let [s# (new java.io.StringWriter)]
-     (binding [*out* s#]
-       (let [ret# ~body]
-         {:return-value ret#
-          :stdout (str s#)}))))
+(defn test-validate-data
+  "Runs inner-main with --validate-data on the given files and asserts both
+  the returned :data-validation-errors? flag and the exact warnings logged.
+  `failures` is the expected sequence of warning messages (empty when the
+  data is valid)."
+  [tabular-file metadata-file failures]
+  (let [has-errors? (boolean (seq failures))
+        warnings (atom [])
+        errors (atom [])]
+    (logging/with-logger
+      (logging/memory-logger warnings errors)
+      (t/is (= (sut/inner-main ["-t" tabular-file
+                                "-u" metadata-file
+                                "--validate-data"])
+               {:data-validation-errors? has-errors?}))
+      (t/is (= @warnings failures))
+      (t/is (= @errors [])))))
 
 (t/deftest inner-main-test-validate-data
-  (t/testing "--validate-data")
-  (let [{:keys [return-value stdout]}
-        (capture (sut/inner-main ["-t" "./test/examples/validation/success.csv"
-                                  "-u" "./test/examples/validation/named-numbers.json"
-                                  "--validate-data"]))]
-    (t/is (= {:data-validation-errors? false} return-value))
-    (t/is (= "" stdout)))
+  (t/testing "--validate-data"
 
-  (let [{:keys [return-value stdout]}
-        (capture (sut/inner-main ["-t" "./test/examples/validation/fail-1.csv"
-                                  "-u" "./test/examples/validation/named-numbers.json"
-                                  "--validate-data"]))]
-    (t/is (= {:data-validation-errors? true} return-value))
-    (t/is (= "Row #3 col #2 (column 'number') has error: Cannot parse 'two' as type 'int': For input string: \"two\"\n"
-             stdout)))
+    (test-validate-data
+     "./test/examples/validation/success.csv"
+     "./test/examples/validation/named-numbers.json"
+     [])
+    (test-validate-data
+     "./test/examples/validation/fail-1.csv"
+     "./test/examples/validation/named-numbers.json"
+     ["Row #3 col #2 (column 'number') in file: fail-1.csv has error: Cannot parse 'two' as type 'int': For input string: \"two\""])
+    (test-validate-data
+     "./test/examples/validation/fail-2.csv"
+     "./test/examples/validation/named-numbers.json"
+     ["Row #3 col #2 (column 'number') in file: fail-2.csv has error: Cannot parse 'three' as type 'int': For input string: \"three\""])
+    (test-validate-data
+     "./test/examples/validation/fail-3.csv"
+     "./test/examples/validation/named-numbers.json"
+     ["Row #3 col #2 (column 'number') in file: fail-3.csv has error: Cannot parse 'three' as type 'int': For input string: \"three\""
+      "Row #4 col #2 (column 'number') in file: fail-3.csv has error: Cannot parse 'four' as type 'int': For input string: \"four\""
+      "Row #5 col #2 (column 'number') in file: fail-3.csv has error: Cannot parse 'five' as type 'int': For input string: \"five\""])
 
-  (let [{:keys [return-value stdout]}
-        (capture (sut/inner-main ["-t" "./test/examples/validation/fail-2.csv"
-                                  "-u" "./test/examples/validation/named-numbers.json"
-                                  "--validate-data"]))]
-    (t/is (= {:data-validation-errors? true} return-value))
-    (t/is (= "Row #3 col #2 (column 'number') has error: Cannot parse 'three' as type 'int': For input string: \"three\"\n"
-             stdout))))
+    (test-validate-data
+     "./test/examples/validation/fail-4.csv"
+     "./test/examples/validation/named-numbers.json"
+     ["Row #3 col #2 (column 'number') in file: fail-4.csv has error: Column value required"])
+    (test-validate-data
+     "./test/examples/validation/success.csv"
+     "./test/examples/validation/named-numbers-incorrect-schema.json"
+     ["Row #2 col #1 (column 'name') in file: success.csv has error: Cannot parse 'one' as type 'int': For input string: \"one\""
+      "Row #3 col #1 (column 'name') in file: success.csv has error: Cannot parse 'two' as type 'int': For input string: \"two\""
+      "Row #4 col #1 (column 'name') in file: success.csv has error: Cannot parse 'three' as type 'int': For input string: \"three\""
+      "Row #5 col #1 (column 'name') in file: success.csv has error: Cannot parse 'four' as type 'int': For input string: \"four\""
+      "Row #6 col #1 (column 'name') in file: success.csv has error: Cannot parse 'five' as type 'int': For input string: \"five\""])))
diff --git a/test/examples/validation/fail-3.csv b/test/examples/validation/fail-3.csv
new file mode 100644
index 00000000..c7c2921c
--- /dev/null
+++ b/test/examples/validation/fail-3.csv
@@ -0,0 +1,5 @@
+name,number
+one,1
+3,three
+four,four
+five,five
diff --git a/test/examples/validation/fail-4.csv b/test/examples/validation/fail-4.csv
new file mode 100644
index 00000000..ca52a71f
--- /dev/null
+++ b/test/examples/validation/fail-4.csv
@@ -0,0 +1,3 @@
+name,number
+one,1
+2,
diff --git a/test/examples/validation/named-numbers-incorrect-schema.json b/test/examples/validation/named-numbers-incorrect-schema.json
new file mode 100644
index 00000000..4fc28fcf
--- /dev/null
+++ b/test/examples/validation/named-numbers-incorrect-schema.json
@@ -0,0 +1,17 @@
+{
+  "@context": "http://www.w3.org/ns/csvw",
+  "tableSchema": {
+    "columns": [
+      {
+        "name": "name",
+        "datatype": "int",
+        "required": true
+      },
+      {
+        "name": "number",
+        "required": true,
+        "datatype": "int"
+      }
+    ]
+  }
+}
diff --git a/test/examples/validation/named-numbers.json b/test/examples/validation/named-numbers.json
index e4083bbd..b540220c 100644
--- a/test/examples/validation/named-numbers.json
+++ b/test/examples/validation/named-numbers.json
@@ -1,6 +1,5 @@
 {
   "@context": "http://www.w3.org/ns/csvw",
-  "url": "fail-2.csv",
   "tableSchema": {
     "columns": [
       {