From 953ea63231a6daaf5852940820ab6865ae2b2507 Mon Sep 17 00:00:00 2001
From: Abhishek Dasgupta <abhidg@trenozoic.net>
Date: Sun, 19 Nov 2023 22:02:23 +0000
Subject: [PATCH] Add manual page

---
 README.md          |  40 ++++++++------
 infer-schema.1     | 135 +++++++++++++++++++++++++++++++++++++++++++++
 infer-schema.1.scd | 112 +++++++++++++++++++++++++++++++++++++
 infer_schema.py    |   1 +
 4 files changed, 271 insertions(+), 17 deletions(-)
 create mode 100644 infer-schema.1
 create mode 100644 infer-schema.1.scd

diff --git a/README.md b/README.md
index b3fca4a..ed9622f 100644
--- a/README.md
+++ b/README.md
@@ -4,27 +4,33 @@ Infer JSON schema from CSV files.
 
 ## Installation
 
-The best way to install is via `pipx`:
+The script can be installed via pip:
 
 ```shell
-pipx install infer-schema
+pip install infer-schema
 ```
 
-Currently, infer-schema is a single script without any external dependencies, so
-you can download and move it to somewhere in your PATH (remember to set
-executable bit using `chmod +x`).
+Currently, infer-schema is a single Python 3 script without any external
+dependencies, so you can download it to somewhere in your PATH and make it
+executable:
+
+```shell
+curl https://raw.githubusercontent.com/abhidg/infer-schema/main/infer_schema.py -o infer-schema
+chmod +x infer-schema
+./infer-schema file
+```
 
 ## Usage
 
-infer-schema should work out of the box with any CSV file. There are a few
-options that help you to tune the schema detection based on the specifics of our
-data.
-
-* **`--enum-threshold`**: Threshold of unique items up to which enum categories
-  should be populated in the JSON schema, default = 10
-* **`--enum-fields`**: Forces a certain field to be classed as an enum, useful
-  for including fields that do not meet `enum-threshold` criteria
-* **`--bound-types`**: Types for which bounds should be encoded into the schema,
-  default is numbers, for which minimum / maximum are determined. For strings
-  minLength and maxLength are determined.Set `--bound-types=none` to disable
-  bound detection
+See [infer-schema(1)](infer-schema.1.scd)
+(from a local clone, use `man -l ./infer-schema.1`)
+
+## Development
+
+Install pre-commit to set up ruff linting and formatting.
+
+To generate the man page, [scdoc](https://git.sr.ht/~sircmpwn/scdoc) is required:
+
+```shell
+scdoc < infer-schema.1.scd > infer-schema.1
+```
diff --git a/infer-schema.1 b/infer-schema.1
new file mode 100644
index 0000000..eafe7e0
--- /dev/null
+++ b/infer-schema.1
@@ -0,0 +1,135 @@
+.\" Generated by scdoc 1.11.2
+.\" Complete documentation for this program is not available as a GNU info page
+.ie \n(.g .ds Aq \(aq
+.el       .ds Aq '
+.nh
+.ad l
+.\" Begin generated content:
+.TH "infer-schema" "1" "2023-11-19"
+.P
+.SH NAME
+.P
+infer-schema - generate JSON Schema from CSV files
+.P
+.SH SYNOPSIS
+.P
+\fBinfer-schema\fR file [options]
+.P
+.SH DESCRIPTION
+.P
+The infer-schema utility generates JSON Schema (draft 7) corresponding to a CSV
+file, such that the CSV file is valid against the generated schema.\& The JSON
+Schema is shown on standard output and can be piped or written to a file with
+the \fB--output\fR option.\&
+.P
+The following options are available:
+.P
+\fB-h\fR, \fB--help\fR
+.RS 4
+Show usage information
+.P
+.RE
+\fB--enum-threshold\fR \fIthreshold\fR
+.RS 4
+The \fIthreshold\fR of unique items up to which enum categories should be
+populated in the JSON schema.\& Default threshold is 10.\&
+.P
+.RE
+\fB--enum-fields\fR \fIfields\fR
+.RS 4
+Forces \fIfields\fR (comma-separated) to be classed as an enum, useful for
+including fields that do not meet enum \fIthreshold\fR criteria
+.P
+.RE
+\fB--bound-types\fR \fItypes\fR
+.RS 4
+Comma-separated \fItypes\fR for which bounds should be encoded into the schema,
+default is '\&number,integer'\&, for which minimum / maximum are determined.\& For
+strings minLength and maxLength are determined.\& Set \fB--bound-types\fR=none to
+disable bound detection.\& Allowed bound types are \fBinteger\fR, \fBnumber\fR and
+\fBstring\fR
+.P
+.RE
+\fB--explicit-nulls\fR
+.RS 4
+By default, fields that have null and another type are typed as non-required
+with the non-null type.\& This setting makes the nulls explicit by dual typing
+a field with the non-null type and null.\&
+.P
+As an example, consider a field '\&count'\& that has the following values
+20,NA,30.\& By default, this field will be typed as '\&integer'\& and will not be
+required.\& With \fB--explicit-nulls\fR set, this will be typed as [integer, null]
+.P
+.RE
+\fB-o\fR \fIoutput\fR, \fB--output\fR \fIoutput\fR
+.RS 4
+Save schema to \fIoutput\fR file
+.P
+.RE
+.SH EXAMPLES
+.P
+Given this CSV file called \fIdates.\&csv\fR
+.P
+.nf
+.RS 4
+date,num_cases
+2022-11-11,4
+2022-11-12,5
+2022-11-13,6
+,10
+2022-11-15,10
+2022-11-16,5
+2022-11-17,3
+2022-11-18,2
+2022-11-19,10
+2022-11-20,11
+2022-11-21,4
+2022-11-22,20
+2022-11-23,
+2022-11-24,9
+2022-11-25,4
+2022-11-26,21
+2022-11-27,99
+2022-11-28,59
+2022-11-30,45
+.fi
+.RE
+.P
+Running '\&infer-schema tests/dates.\&csv'\& gives the following output
+.P
+.nf
+.RS 4
+{
+	"$schema": "https://json-schema\&.org/draft-07/schema",
+	"description": "Description of tests/dates\&.csv",
+	"properties": {
+		"date": {
+			"description": "Description for column date",
+			"format": "date",
+			"type": "string"
+		},
+		"num_cases": {
+			"description": "Description for column num_cases",
+			"maximum": 99,
+			"minimum": 2,
+			"type": "integer"
+		}
+	},
+	"required": [],
+	"title": "JSON Schema for tests/dates\&.csv"
+}
+.fi
+.RE
+.P
+Here we see that infer-schema determines minimum and maximum values for integer
+columns.\& For strings, minLength and maxLength are determined.\& This is controlled
+by the \fB--bound-types\fR setting, which can be set to \fBnone\fR to turn off bounds
+detection.\&
+.P
+By default, any column with up to 10 (default \fB--enum-threshold\fR) unique values
+is considered categorical and expressed as a JSON Schema enum type.\& Columns with
+more than 10 values can be forced to be of enum type by using \fB--enum-fields\fR.\&
+.P
+.SH BUGS
+.P
+Report bugs at \fIhttps://github.\&com/abhidg/infer-schema/issues\fR
diff --git a/infer-schema.1.scd b/infer-schema.1.scd
new file mode 100644
index 0000000..6b77315
--- /dev/null
+++ b/infer-schema.1.scd
@@ -0,0 +1,112 @@
+infer-schema(1)
+
+# NAME
+
+infer-schema - generate JSON Schema from CSV files
+
+# SYNOPSIS
+
+*infer-schema* file [options]
+
+# DESCRIPTION
+
+The infer-schema utility generates JSON Schema (draft 7) corresponding to a CSV
+file, such that the CSV file is valid against the generated schema. The JSON
+Schema is shown on standard output and can be piped or written to a file with
+the *--output* option.
+
+The following options are available:
+
+*-h*, *--help*
+	Show usage information
+
+*--enum-threshold* _threshold_
+	The _threshold_ of unique items up to which enum categories should be
+	populated in the JSON schema. Default threshold is 10.
+
+*--enum-fields* _fields_
+	Forces _fields_ (comma-separated) to be classed as an enum, useful for
+	including fields that do not meet enum _threshold_ criteria
+
+*--bound-types* _types_
+	Comma-separated _types_ for which bounds should be encoded into the schema,
+	default is 'number,integer', for which minimum / maximum are determined. For
+	strings minLength and maxLength are determined. Set *--bound-types*=none to
+	disable bound detection. Allowed bound types are *integer*, *number* and
+	*string*
+
+*--explicit-nulls*
+	By default, fields that have null and another type are typed as non-required
+	with the non-null type. This setting makes the nulls explicit by dual typing
+	a field with the non-null type and null.
+
+	As an example, consider a field 'count' that has the following values
+	20,NA,30. By default, this field will be typed as 'integer' and will not be
+	required. With *--explicit-nulls* set, this will be typed as [integer, null]
+
+*-o* _output_, *--output* _output_
+	Save schema to _output_ file
+
+# EXAMPLES
+
+Given this CSV file called _dates.csv_
+
+```
+date,num_cases
+2022-11-11,4
+2022-11-12,5
+2022-11-13,6
+,10
+2022-11-15,10
+2022-11-16,5
+2022-11-17,3
+2022-11-18,2
+2022-11-19,10
+2022-11-20,11
+2022-11-21,4
+2022-11-22,20
+2022-11-23,
+2022-11-24,9
+2022-11-25,4
+2022-11-26,21
+2022-11-27,99
+2022-11-28,59
+2022-11-30,45
+```
+
+Running 'infer-schema tests/dates.csv' gives the following output
+
+```
+{
+	"$schema": "https://json-schema.org/draft-07/schema",
+	"description": "Description of tests/dates.csv",
+	"properties": {
+		"date": {
+			"description": "Description for column date",
+			"format": "date",
+			"type": "string"
+		},
+		"num_cases": {
+			"description": "Description for column num_cases",
+			"maximum": 99,
+			"minimum": 2,
+			"type": "integer"
+		}
+	},
+	"required": [],
+	"title": "JSON Schema for tests/dates.csv"
+}
+```
+
+Here we see that infer-schema determines minimum and maximum values for integer
+columns. For strings, minLength and maxLength are determined. This is controlled
+by the *--bound-types* setting, which can be set to *none* to turn off bounds
+detection.
+
+By default, any column with up to 10 (default *--enum-threshold*) unique values
+is considered categorical and expressed as a JSON Schema enum type. Columns with
+more than 10 values can be forced to be of enum type by using *--enum-fields*.
+
+# BUGS
+
+Report bugs at https://github.com/abhidg/infer-schema/issues
diff --git a/infer_schema.py b/infer_schema.py
index ad8f343..86c1dd6 100644
--- a/infer_schema.py
+++ b/infer_schema.py
@@ -1,3 +1,4 @@
+#!/usr/bin/env python3
 import csv
 import json
 import argparse