Add extra context to the validation error logging
Also:
* ensure each validation error is only printed once
* add a couple of extra test cases
stevorobs3 committed Nov 30, 2023
1 parent 5dc1bf3 commit 393e1eb
Showing 8 changed files with 116 additions and 64 deletions.
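
For orientation, the reworked --validate-data path now reports each failing cell exactly once, as a logged warning naming the row, column, column title and source file. A rough REPL sketch of that flow, using the fixture files added in this commit (the session itself is illustrative, not part of the change):

(require '[csv2rdf.main :as main])

;; Validate a tabular file against its CSVW metadata without producing RDF.
(main/inner-main ["-t" "./test/examples/validation/fail-1.csv"
                  "-u" "./test/examples/validation/named-numbers.json"
                  "--validate-data"])
;; => {:data-validation-errors? true}
;; One warning is logged for the single bad cell:
;; Row #3 col #2 (column 'number') in file: fail-1.csv has error: Cannot parse 'two' as type 'int': For input string: "two"
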
31 changes: 13 additions & 18 deletions src/csv2rdf/csvw.clj
@@ -23,7 +23,7 @@
(defn annotate-tables [tabular-source metadata-source]
(processing/get-metadata tabular-source metadata-source))

(defn- validate-rows
(defn- rows-are-valid?
"Validates the CSVW schema for the given tabular file, metadata and options.
`tabular-source` and `metadata-source` can be any of the following
@@ -36,20 +36,22 @@
If metadata-source is non-nil then processing will start from the
associated metadata document, otherwise it will start from
tabular-source."
tabular-source.
Returns true if no validation errors were detected."
[tabular-source metadata-source]
(let [{:keys [tables] :as metadata} (processing/get-metadata tabular-source metadata-source)
table-group-dialect (:dialect metadata)
output-tables (remove properties/suppress-output? tables)
;;ctx (table-group-context mode metadata) ;; TODO this might be useful later when iterating over tables
]

(util/liberal-mapcat (fn [{:keys [url dialect] :as table}]
;;(validated-rows ctx table table-group-dialect)
(let [dialect (or dialect table-group-dialect)]
(csv/annotated-rows url table dialect)))
results (util/liberal-mapcat (fn [{:keys [url dialect] :as table}]
(let [dialect (or dialect table-group-dialect)]
(csv/annotated-rows url table dialect)))

output-tables)))
output-tables)]
(->> results
(mapcat :cells)
(mapcat :errors)
empty?)))

(defn only-validate-schema
"Only validate the data against the schemas in the metadata file, and
@@ -58,14 +60,7 @@
Returns a map with the key `:data-validation-errors?` set to a
boolean indicating whether any schema errors occurred."
[{:keys [tabular-source metadata-source]}]
(let [errors? (atom false)]
(doseq [{:keys [cells] row-number :source-number :as _row} (validate-rows tabular-source metadata-source)
{:keys [errors column-number column] :as _cell} cells
:when (seq errors)]
(reset! errors? true)
(doseq [error errors]
(println (format "Row #%d col #%d (column '%s') has error: " row-number column-number (:name column)) error)))
{:data-validation-errors? @errors?}))
{:data-validation-errors? (not (rows-are-valid? tabular-source metadata-source))})

(defn csv->rdf
"Runs the CSVW process for the given tabular or metadata data sources
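
The same check is exposed programmatically via csv2rdf.csvw/only-validate-schema, which now just delegates to rows-are-valid? above. A minimal sketch of calling it directly; passing java.io.File sources here is an assumption based on the (collapsed) list of accepted source types in the docstring:

(require '[csv2rdf.csvw :as csvw]
         '[clojure.java.io :as io])

;; Walks every non-suppressed table, parses each row's cells, and reports
;; whether any cell-level errors were found.
(csvw/only-validate-schema
 {:tabular-source  (io/file "./test/examples/validation/success.csv")
  :metadata-source (io/file "./test/examples/validation/named-numbers.json")})
;; => {:data-validation-errors? false}
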
7 changes: 5 additions & 2 deletions src/csv2rdf/logging.clj
@@ -29,8 +29,11 @@
(warn [_this _msg])
(error [_this _msg]))

(defn memory-logger []
(->MemoryLogger (atom []) (atom [])))
(defn memory-logger
([]
(memory-logger (atom []) (atom [])))
([warnings errors]
(->MemoryLogger warnings errors)))

(def ^:dynamic *logger* (->ForwardingLogger))

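
The extra arity of memory-logger lets a caller supply its own atoms and inspect everything that was logged afterwards, which is exactly how the new test helper below uses it. A small sketch, assuming (as the tests do) that MemoryLogger appends each warning or error message to the corresponding atom:

(require '[csv2rdf.logging :as logging])

(let [warnings (atom [])
      errors   (atom [])]
  ;; Route all logging to a MemoryLogger backed by our own atoms for the
  ;; dynamic extent of the body, then inspect what was captured.
  (logging/with-logger (logging/memory-logger warnings errors)
    (logging/log-warning "something looked suspicious"))
  {:warnings @warnings :errors @errors})
;; => {:warnings ["something looked suspicious"] :errors []}
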
37 changes: 25 additions & 12 deletions src/csv2rdf/tabular/csv.clj
@@ -1,5 +1,6 @@
(ns csv2rdf.tabular.csv
(:require [clojure.string :as string]
(:require [clojure.string :as str]
[clojure.string :as string]
[csv2rdf.metadata.dialect :as dialect]
[csv2rdf.tabular.csv.reader :as reader]
[csv2rdf.metadata.table :as table]
@@ -29,7 +30,7 @@
(column/from-titles idx titles default-lang))
titles)]
{:comments (mapv :comment comment-rows)
:columns columns}))
:columns columns}))

(defn ^{:tabular-spec "8.10.3"} validate-data-rows
"Validates the data rows in the tabular file and extracts any embedded comments. The row number of any rows
@@ -102,9 +103,13 @@
cell))
parsed-cells)))

(defn parse-row-cells [{:keys [cells] :as row} table {:keys [skipColumns] :as options}]
(defn parse-row-cells
[{:keys [cells source-row-number] :as row}
{:keys [url] :as table}
{:keys [skipColumns]}]
(let [columns (table/columns table)
cell-values (concat (drop skipColumns cells) (repeat "")) ;;extend cells to cover any virtual columns
;;extend cells to cover any virtual columns
cell-values (concat (drop skipColumns cells) (repeat ""))
cell-column-pairs (map vector cell-values columns)
parsed-cells (map-indexed (fn [col-idx [cell column]]
(let [result (cell/parse-cell cell column)
@@ -115,8 +120,16 @@
:column column)))
cell-column-pairs)]
;;log cell errors
(doseq [err (mapcat cell/errors parsed-cells)]
(logging/log-warning err))
(doseq [{:keys [errors column-number column]} parsed-cells
:when (seq errors)]
(doseq [error errors]
(logging/log-warning
(format "Row #%s col #%s (column '%s') in file: %s has error: %s"
source-row-number
column-number
(:name column)
(last (str/split (.toString url) #"/"))
error))))

(assoc row :parsed-cells parsed-cells)))

@@ -130,14 +143,14 @@
(assoc column-value-bindings :_row number :_sourceRow source-row-number)))

(defn get-cell-template-bindings [{:keys [column-number source-column-number column] :as cell}]
{:_name (util/percent-decode (properties/column-name column))
:_column column-number
{:_name (util/percent-decode (properties/column-name column))
:_column column-number
:_sourceColumn source-column-number})

(defn get-cell-urls [bindings table {:keys [column] :as cell}]
(let [property-urls {:aboutUrl (some-> (properties/about-url column) (template-property/resolve-uri-template-property bindings table))
(let [property-urls {:aboutUrl (some-> (properties/about-url column) (template-property/resolve-uri-template-property bindings table))
:propertyUrl (some-> (properties/property-url column) (template-property/resolve-uri-template-property bindings table))
:valueUrl (some-> (properties/value-url column) (template-property/resolve-value-uri-template-property cell column bindings table))}]
:valueUrl (some-> (properties/value-url column) (template-property/resolve-value-uri-template-property cell column bindings table))}]
(util/filter-values some? property-urls)))

(defn annotate-row [{:keys [number source-row-number parsed-cells] :as data-row} table title-column-indexes]
@@ -149,10 +162,10 @@
property-urls (get-cell-urls bindings table cell)]
(merge cell property-urls)))
parsed-cells)]
{:number number
{:number number
:source-number source-row-number
:cells (vec cells)
:titles (get-row-titles title-column-indexes parsed-cells)}))
:titles (get-row-titles title-column-indexes parsed-cells)}))

(defn skip-to-data-rows [rows {:keys [skipRows num-header-rows] :as options}]
(let [row-offset (+ skipRows num-header-rows)]
79 changes: 48 additions & 31 deletions test/csv2rdf/main_test.clj
@@ -1,41 +1,58 @@
(ns csv2rdf.main-test
(:require [csv2rdf.main :as sut]
(:require [csv2rdf.logging :as logging]
[csv2rdf.main :as sut]
[clojure.test :as t]))

;; See issue 47
;; Resolving template property URIs with values containing spaces should work

(defmacro capture
"Capture return value of body and stdout, and return a hashmap
of :return-value and :stdout."
[body]
`(let [s# (new java.io.StringWriter)]
(binding [*out* s#]
(let [ret# ~body]
{:return-value ret#
:stdout (str s#)}))))

(defn test-validate-data [tabular-file
metadata-file
failures]
(let [errors-expected? (boolean (seq failures))
warnings (atom [])
errors (atom [])]
(logging/with-logger
(logging/memory-logger warnings errors)
(t/is (= (sut/inner-main ["-t" tabular-file
"-u" metadata-file
"--validate-data"])
{:data-validation-errors? errors-expected?}))
(t/is (= @warnings failures))
(t/is (= @errors [])))))

(t/deftest inner-main-test-validate-data
(t/testing "--validate-data")
(let [{:keys [return-value stdout]}
(capture (sut/inner-main ["-t" "./test/examples/validation/success.csv"
"-u" "./test/examples/validation/named-numbers.json"
"--validate-data"]))]
(t/is (= {:data-validation-errors? false} return-value))
(t/is (= "" stdout)))
(t/testing "--validate-data"

(let [{:keys [return-value stdout]}
(capture (sut/inner-main ["-t" "./test/examples/validation/fail-1.csv"
"-u" "./test/examples/validation/named-numbers.json"
"--validate-data"]))]
(t/is (= {:data-validation-errors? true} return-value))
(t/is (= "Row #3 col #2 (column 'number') has error: Cannot parse 'two' as type 'int': For input string: \"two\"\n"
stdout)))
(test-validate-data
"./test/examples/validation/success.csv"
"./test/examples/validation/named-numbers.json"
[])
(test-validate-data
"./test/examples/validation/fail-1.csv"
"./test/examples/validation/named-numbers.json"
["Row #3 col #2 (column 'number') in file: fail-1.csv has error: Cannot parse 'two' as type 'int': For input string: \"two\""])
(test-validate-data
"./test/examples/validation/fail-2.csv"
"./test/examples/validation/named-numbers.json"
["Row #3 col #2 (column 'number') in file: fail-2.csv has error: Cannot parse 'three' as type 'int': For input string: \"three\""])
(test-validate-data
"./test/examples/validation/fail-3.csv"
"./test/examples/validation/named-numbers.json"
["Row #3 col #2 (column 'number') in file: fail-3.csv has error: Cannot parse 'three' as type 'int': For input string: \"three\""
"Row #4 col #2 (column 'number') in file: fail-3.csv has error: Cannot parse 'four' as type 'int': For input string: \"four\""
"Row #5 col #2 (column 'number') in file: fail-3.csv has error: Cannot parse 'five' as type 'int': For input string: \"five\""])

(let [{:keys [return-value stdout]}
(capture (sut/inner-main ["-t" "./test/examples/validation/fail-2.csv"
"-u" "./test/examples/validation/named-numbers.json"
"--validate-data"]))]
(t/is (= {:data-validation-errors? true} return-value))
(t/is (= "Row #3 col #2 (column 'number') has error: Cannot parse 'three' as type 'int': For input string: \"three\"\n"
stdout))))
(test-validate-data
"./test/examples/validation/fail-4.csv"
"./test/examples/validation/named-numbers.json"
["Row #3 col #2 (column 'number') in file: fail-4.csv has error: Column value required"])
(test-validate-data
"./test/examples/validation/success.csv"
"./test/examples/validation/named-numbers-incorrect-schema.json"
["Row #2 col #1 (column 'name') in file: success.csv has error: Cannot parse 'one' as type 'int': For input string: \"one\""
"Row #3 col #1 (column 'name') in file: success.csv has error: Cannot parse 'two' as type 'int': For input string: \"two\""
"Row #4 col #1 (column 'name') in file: success.csv has error: Cannot parse 'three' as type 'int': For input string: \"three\""
"Row #5 col #1 (column 'name') in file: success.csv has error: Cannot parse 'four' as type 'int': For input string: \"four\""
"Row #6 col #1 (column 'name') in file: success.csv has error: Cannot parse 'five' as type 'int': For input string: \"five\""])))
5 changes: 5 additions & 0 deletions test/examples/validation/fail-3.csv
@@ -0,0 +1,5 @@
name,number
one,1
3,three
four,four
five,five
3 changes: 3 additions & 0 deletions test/examples/validation/fail-4.csv
@@ -0,0 +1,3 @@
name,number
one,1
2,
17 changes: 17 additions & 0 deletions test/examples/validation/named-numbers-incorrect-schema.json
@@ -0,0 +1,17 @@
{
"@context": "http://www.w3.org/ns/csvw",
"tableSchema": {
"columns": [
{
"name": "name",
"datatype": "int",
"required": true
},
{
"name": "number",
"required": true,
"datatype": "int"
}
]
}
}
1 change: 0 additions & 1 deletion test/examples/validation/named-numbers.json
@@ -1,6 +1,5 @@
{
"@context": "http://www.w3.org/ns/csvw",
"url": "fail-2.csv",
"tableSchema": {
"columns": [
{
