diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
index 29baf6db..b09f6e3d 100644
--- a/.github/workflows/docs.yml
+++ b/.github/workflows/docs.yml
@@ -19,7 +19,7 @@ jobs:
- run: |
npm install
npm run generate
- working-directory: tests
+ working-directory: dev
- name: clone gh-pages and clean-up
if: ${{ env.GITHUB_REF_SLUG == 'master' }}
run: |
@@ -31,8 +31,8 @@ jobs:
if: ${{ env.GITHUB_REF_SLUG != 'master' }}
run: mkdir gh-pages
- run: |
- cp tests/docs.html index.html
- cp tests/processes.json processes.json
+ cp dev/docs.html index.html
+ cp dev/processes.json processes.json
rsync -vrm --include='*.json' --include='*.html' --include='meta/***' --include='proposals/***' --exclude='*' . gh-pages
- name: deploy to root (master)
uses: peaceiris/actions-gh-pages@v3
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index b108eb18..25659365 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -8,8 +8,8 @@ jobs:
with:
node-version: 'lts/*'
- uses: actions/checkout@v3
- - name: Run tests
+ - name: Run linter
run: |
npm install
- npm run test
- working-directory: tests
\ No newline at end of file
+ npm test
+ working-directory: dev
\ No newline at end of file
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 407447dc..bb7814c5 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -16,7 +16,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- `filter_vector`
- `flatten_dimensions`
- `load_geojson`
+ - `load_ml_model`
- `load_url`
+ - `ml_fit_class_random_forest`
+ - `ml_fit_class_xgboost`
+ - `ml_fit_regr_random_forest`
+ - `ml_predict`
+ - `save_ml_model`
- `unflatten_dimension`
- `vector_buffer`
- `vector_reproject`
diff --git a/dev/.gitignore b/dev/.gitignore
new file mode 100644
index 00000000..e29b5fbe
--- /dev/null
+++ b/dev/.gitignore
@@ -0,0 +1,3 @@
+/node_modules/
+/package-lock.json
+/processes.json
diff --git a/dev/.words b/dev/.words
new file mode 100644
index 00000000..1471ac26
--- /dev/null
+++ b/dev/.words
@@ -0,0 +1,57 @@
+0-to-9
+1-to-0
+anno
+behavior
+boolean
+center
+centers
+dekad
+DEM-based
+Domini
+gamma0
+GeoJSON
+FeatureCollections
+labeled
+MathWorld
+n-ary
+neighbor
+neighborhood
+neighborhoods
+openEO
+orthorectification
+orthorectified
+radiometrically
+reflectances
+reproject
+reprojected
+Reprojects
+resample
+resampled
+resamples
+Resamples
+resampling
+Sentinel-2
+Sentinel-2A
+Sentinel-2B
+signum
+STAC
+catalog
+Catalog
+summand
+UDFs
+gdalwarp
+Lanczos
+sinc
+interpolants
+Breiman
+Hyndman
+date1
+date2
+favor
+XGBoost
+Chen
+Guestrin
+early_stopping_rounds
+Subsample
+hessian
+overfitting
diff --git a/dev/README.md b/dev/README.md
new file mode 100644
index 00000000..fc2382fe
--- /dev/null
+++ b/dev/README.md
@@ -0,0 +1,30 @@
+# Tests for openEO Processes
+
+To run the tests follow these steps:
+
+1. Install [node and npm](https://nodejs.org) - should run with any recent version
+2. Run `npm install` in this folder to install the dependencies
+3. Run the tests with `npm test`. This will also lint the files and verify it follows best practices.
+4. To show the files nicely formatted in a web browser, run `npm start`. It starts a server and opens the corresponding page in a web browser.
+
+## Development processes
+
+All new processes must be added to the `proposals` folder. Each process must be declared to be `experimental`.
+Processes must comply to best practices, which ensure a certain degree of consistency.
+`npm test` will validate and lint the processes and also ensure the best practices are applied.
+
+The linting checks that the files are named correctly, that the content is correctly formatted and indented (JSON and embedded CommonMark).
+The best practices ensure, for example, that fields are neither too short nor too long.
+
+A spell check also checks the texts. It may report names and rarely used technical words as errors.
+If you are sure that these are correct, you can add them to the `.words` file to exclude the word from being reported as an error.
+The file must contain one word per line.
+
+New processes should be added via GitHub Pull Requests.
+
+## Subtype schemas
+
+Sometimes it is useful to define a new "data type" on top of the JSON types (number, string, array, object, ...).
+For example, a client could make a select box with all collections available by adding a subtype `collection-id` to the JSON type `string`.
+If you think a new subtype should be added, you need to add it to the `meta/subtype-schemas.json` file.
+It must be a valid JSON Schema. The tests mentioned above will also verify to a certain degree that the subtypes are defined correctly.
diff --git a/dev/docs.html b/dev/docs.html
new file mode 100644
index 00000000..04b1c192
--- /dev/null
+++ b/dev/docs.html
@@ -0,0 +1,125 @@
+
+
+
+
+
+ openEO API Processes
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/dev/package.json b/dev/package.json
new file mode 100644
index 00000000..1da8693f
--- /dev/null
+++ b/dev/package.json
@@ -0,0 +1,30 @@
+{
+ "name": "@openeo/processes",
+ "version": "2.0.0-rc.1",
+ "author": "openEO Consortium",
+ "contributors": [
+ {
+ "name": "Matthias Mohr"
+ }
+ ],
+ "license": "Apache-2.0",
+ "description": "Validates the processes specified in this repository.",
+ "homepage": "https://openeo.org",
+ "bugs": {
+ "url": "https://github.com/Open-EO/openeo-processes/issues"
+ },
+ "repository": {
+ "type": "git",
+ "url": "git+https://github.com/Open-EO/openeo-processes.git"
+ },
+ "devDependencies": {
+ "@openeo/processes-lint": "^0.1.5",
+ "concat-json-files": "^1.1.0",
+ "http-server": "^14.1.1"
+ },
+ "scripts": {
+ "test": "openeo-processes-lint testConfig.json",
+ "generate": "concat-json-files \"../{*,proposals/*}.json\" -t \"processes.json\"",
+ "start": "npm run generate && http-server -p 9876 -o docs.html -c-1"
+ }
+}
diff --git a/dev/testConfig.json b/dev/testConfig.json
new file mode 100644
index 00000000..9b5fbcb2
--- /dev/null
+++ b/dev/testConfig.json
@@ -0,0 +1,14 @@
+{
+ "folder": "../",
+ "proposalsFolder": "../proposals/",
+ "ignoredWords": ".words",
+ "anyOfRequired": [
+ "array_element",
+ "quantiles"
+ ],
+ "subtypeSchemas": "../meta/subtype-schemas.json",
+ "checkSubtypeSchemas": true,
+ "forbidDeprecatedTypes": false,
+ "checkProcessLinks": true,
+ "verbose": false
+}
diff --git a/meta/subtype-schemas.json b/meta/subtype-schemas.json
index 347df234..83ce72ba 100644
--- a/meta/subtype-schemas.json
+++ b/meta/subtype-schemas.json
@@ -232,6 +232,12 @@
}
}
},
+ "ml-model": {
+ "type": "object",
+ "subtype": "ml-model",
+ "title": "Machine Learning Model",
+ "description": "A machine learning model, accompanied by STAC metadata that implements the STAC ml-model extension."
+ },
"output-format": {
"type": "string",
"subtype": "output-format",
diff --git a/proposals/ml_fit_class_xgboost.json b/proposals/ml_fit_class_xgboost.json
new file mode 100644
index 00000000..cced25b1
--- /dev/null
+++ b/proposals/ml_fit_class_xgboost.json
@@ -0,0 +1,115 @@
+{
+ "id": "ml_fit_class_xgboost",
+ "summary": "Train an XGBoost classification model",
+ "description": "Fit an XGBoost classification model to training data. XGBoost is a high-performance, flexible, and portable distributed gradient boosting library. It implements machine learning algorithms within the Gradient Boosting framework, featuring parallel tree boosting for efficiency.",
+ "categories": [
+ "machine learning"
+ ],
+ "experimental": true,
+ "parameters": [
+ {
+ "name": "predictors",
+ "description": "The predictors for the XGBoost classification model as a vector data cube. They are the independent variables that the XGBoost algorithm analyses to learn patterns and relationships within the data.",
+ "schema": {
+ "type": "object",
+ "subtype": "datacube",
+ "dimensions": [
+ {
+ "type": "geometry"
+ },
+ {
+ "type": "bands"
+ }
+ ]
+ }
+ },
+ {
+ "name": "target",
+ "description": "Labeled data for XGBoost classification, aligning with predictor values based on a shared geometry dimension. This ensures a clear connection between predictor rows and labels.",
+ "schema": {
+ "type": "object",
+ "subtype": "datacube",
+ "dimensions": [
+ {
+ "type": "geometry"
+ }
+ ]
+ }
+ },
+ {
+ "name": "learning_rate",
+ "description": "Step size shrinkage used in update to prevent overfitting.",
+ "schema": {
+ "type": "number",
+ "minimum": 0,
+ "default": 0.15
+ }
+ },
+ {
+ "name": "max_depth",
+ "description": "Maximum depth of a tree.",
+ "schema": {
+ "type": "integer",
+ "minimum": 1,
+ "default": 5
+ }
+ },
+ {
+ "name": "min_child_weight",
+ "description": "Minimum sum of instance weight (hessian) needed in a child.",
+ "schema": {
+ "type": "number",
+ "minimum": 0,
+ "default": 1
+ }
+ },
+ {
+ "name": "subsample",
+ "description": "Subsample ratio of the training instance.",
+ "optional": true,
+ "default": 0.8,
+ "schema": {
+ "type": "number",
+ "minimum": 0,
+ "maximum": 1
+ }
+ },
+ {
+ "name": "min_split_loss",
+ "description": "Minimum loss reduction required to make a further partition on a leaf node of the tree.",
+ "optional": true,
+ "default": 1,
+ "schema": {
+ "type": "number",
+ "minimum": 0
+ }
+ },
+ {
+ "name": "seed",
+ "description": "A randomization seed to use for the random sampling in training. If not given or `null`, no seed is used and results may differ on subsequent use.",
+ "optional": true,
+ "default": null,
+ "schema": {
+ "type": [
+ "integer",
+ "null"
+ ]
+ }
+ }
+ ],
+ "returns": {
+ "description": "A model object that can be saved with ``save_ml_model()`` and restored with ``load_ml_model()``.",
+ "schema": {
+ "type": "object",
+ "subtype": "ml-model"
+ }
+ },
+ "links": [
+ {
+ "href": "https://dl.acm.org/doi/10.1145/2939672.2939785",
+ "title": "Chen and Guestrin (2016), XGBoost: A Scalable Tree Boosting System",
+ "type": "text/html",
+ "rel": "about"
+ }
+ ]
+}
\ No newline at end of file
diff --git a/proposals/load_ml_model.json b/proposals/load_ml_model.json
new file mode 100644
index 00000000..7fa86d89
--- /dev/null
+++ b/proposals/load_ml_model.json
@@ -0,0 +1,46 @@
+{
+ "id": "load_ml_model",
+ "summary": "Load a ML model",
+ "description": "Loads a machine learning model from a STAC Item.\n\nSuch a model could be trained and saved as part of a previous batch job with processes such as ``ml_fit_regr_random_forest()`` and ``save_ml_model()``.",
+ "categories": [
+ "machine learning",
+ "import"
+ ],
+ "experimental": true,
+ "parameters": [
+ {
+ "name": "uri",
+ "description": "The STAC Item to load the machine learning model from. The STAC Item must implement the `ml-model` extension.",
+ "schema": [
+ {
+ "title": "URL",
+ "type": "string",
+ "format": "uri",
+ "subtype": "uri",
+ "pattern": "^https?://"
+ },
+ {
+ "title": "User-uploaded File",
+ "type": "string",
+ "subtype": "file-path",
+ "pattern": "^[^\r\n\\:'\"]+$"
+ }
+ ]
+ }
+ ],
+ "returns": {
+ "description": "A machine learning model to be used with machine learning processes such as ``ml_predict()``.",
+ "schema": {
+ "type": "object",
+ "subtype": "ml-model"
+ }
+ },
+ "links": [
+ {
+ "href": "https://github.com/stac-extensions/ml-model",
+ "title": "STAC ml-model extension",
+ "type": "text/html",
+ "rel": "about"
+ }
+ ]
+}
diff --git a/proposals/ml_fit_class_random_forest.json b/proposals/ml_fit_class_random_forest.json
new file mode 100644
index 00000000..63da48a1
--- /dev/null
+++ b/proposals/ml_fit_class_random_forest.json
@@ -0,0 +1,110 @@
+{
+ "id": "ml_fit_class_random_forest",
+ "summary": "Train a random forest classification model",
+ "description": "Executes the fit of a random forest classification based on training data. The process does not include a separate split of the data in test, validation and training data. The Random Forest classification model is based on the approach by Breiman (2001).",
+ "categories": [
+ "machine learning"
+ ],
+ "experimental": true,
+ "parameters": [
+ {
+ "name": "predictors",
+ "description": "The predictors for the classification model as a vector data cube. Aggregated to the features (vectors) of the target input variable.",
+ "schema": [
+ {
+ "type": "object",
+ "subtype": "datacube",
+ "dimensions": [
+ {
+ "type": "geometry"
+ },
+ {
+ "type": "bands"
+ }
+ ]
+ },
+ {
+ "type": "object",
+ "subtype": "datacube",
+ "dimensions": [
+ {
+ "type": "geometry"
+ },
+ {
+ "type": "other"
+ }
+ ]
+ }
+ ]
+ },
+ {
+ "name": "target",
+ "description": "The training sites for the classification model as a vector data cube. This is associated with the target variable for the Random Forest model. The geometry has to be associated with a value to predict (e.g. fractional forest canopy cover).",
+ "schema": {
+ "type": "object",
+ "subtype": "datacube",
+ "dimensions": [
+ {
+ "type": "geometry"
+ }
+ ]
+ }
+ },
+ {
+ "name": "max_variables",
+ "description": "Specifies how many split variables will be used at a node.\n\nThe following options are available:\n\n- *integer*: The given number of variables are considered for each split.\n- `all`: All variables are considered for each split.\n- `log2`: The logarithm with base 2 of the number of variables are considered for each split.\n- `onethird`: A third of the number of variables are considered for each split.\n- `sqrt`: The square root of the number of variables are considered for each split. This is often the default for classification.",
+ "schema": [
+ {
+ "type": "integer",
+ "minimum": 1
+ },
+ {
+ "type": "string",
+ "enum": [
+ "all",
+ "log2",
+ "onethird",
+ "sqrt"
+ ]
+ }
+ ]
+ },
+ {
+ "name": "num_trees",
+ "description": "The number of trees built within the Random Forest classification.",
+ "optional": true,
+ "default": 100,
+ "schema": {
+ "type": "integer",
+ "minimum": 1
+ }
+ },
+ {
+ "name": "seed",
+ "description": "A randomization seed to use for the random sampling in training. If not given or `null`, no seed is used and results may differ on subsequent use.",
+ "optional": true,
+ "default": null,
+ "schema": {
+ "type": [
+ "integer",
+ "null"
+ ]
+ }
+ }
+ ],
+ "returns": {
+ "description": "A model object that can be saved with ``save_ml_model()`` and restored with ``load_ml_model()``.",
+ "schema": {
+ "type": "object",
+ "subtype": "ml-model"
+ }
+ },
+ "links": [
+ {
+ "href": "https://doi.org/10.1023/A:1010933404324",
+ "title": "Breiman (2001): Random Forests",
+ "type": "text/html",
+ "rel": "about"
+ }
+ ]
+}
diff --git a/proposals/ml_fit_regr_random_forest.json b/proposals/ml_fit_regr_random_forest.json
new file mode 100644
index 00000000..39207324
--- /dev/null
+++ b/proposals/ml_fit_regr_random_forest.json
@@ -0,0 +1,110 @@
+{
+ "id": "ml_fit_regr_random_forest",
+ "summary": "Train a random forest regression model",
+ "description": "Executes the fit of a random forest regression based on training data. The process does not include a separate split of the data in test, validation and training data. The Random Forest regression model is based on the approach by Breiman (2001).",
+ "categories": [
+ "machine learning"
+ ],
+ "experimental": true,
+ "parameters": [
+ {
+ "name": "predictors",
+ "description": "The predictors for the regression model as a vector data cube. Aggregated to the features (vectors) of the target input variable.",
+ "schema": [
+ {
+ "type": "object",
+ "subtype": "datacube",
+ "dimensions": [
+ {
+ "type": "geometry"
+ },
+ {
+ "type": "bands"
+ }
+ ]
+ },
+ {
+ "type": "object",
+ "subtype": "datacube",
+ "dimensions": [
+ {
+ "type": "geometry"
+ },
+ {
+ "type": "other"
+ }
+ ]
+ }
+ ]
+ },
+ {
+ "name": "target",
+ "description": "The training sites for the regression model as a vector data cube. This is associated with the target variable for the Random Forest model. The geometry has to be associated with a value to predict (e.g. fractional forest canopy cover).",
+ "schema": {
+ "type": "object",
+ "subtype": "datacube",
+ "dimensions": [
+ {
+ "type": "geometry"
+ }
+ ]
+ }
+ },
+ {
+ "name": "max_variables",
+ "description": "Specifies how many split variables will be used at a node.\n\nThe following options are available:\n\n- *integer*: The given number of variables are considered for each split.\n- `all`: All variables are considered for each split.\n- `log2`: The logarithm with base 2 of the number of variables are considered for each split.\n- `onethird`: A third of the number of variables are considered for each split. This is often the default for regression.\n- `sqrt`: The square root of the number of variables are considered for each split.",
+ "schema": [
+ {
+ "type": "integer",
+ "minimum": 1
+ },
+ {
+ "type": "string",
+ "enum": [
+ "all",
+ "log2",
+ "onethird",
+ "sqrt"
+ ]
+ }
+ ]
+ },
+ {
+ "name": "num_trees",
+ "description": "The number of trees built within the Random Forest regression.",
+ "optional": true,
+ "default": 100,
+ "schema": {
+ "type": "integer",
+ "minimum": 1
+ }
+ },
+ {
+ "name": "seed",
+ "description": "A randomization seed to use for the random sampling in training. If not given or `null`, no seed is used and results may differ on subsequent use.",
+ "optional": true,
+ "default": null,
+ "schema": {
+ "type": [
+ "integer",
+ "null"
+ ]
+ }
+ }
+ ],
+ "returns": {
+ "description": "A model object that can be saved with ``save_ml_model()`` and restored with ``load_ml_model()``.",
+ "schema": {
+ "type": "object",
+ "subtype": "ml-model"
+ }
+ },
+ "links": [
+ {
+ "href": "https://doi.org/10.1023/A:1010933404324",
+ "title": "Breiman (2001): Random Forests",
+ "type": "text/html",
+ "rel": "about"
+ }
+ ]
+}
diff --git a/proposals/ml_predict.json b/proposals/ml_predict.json
new file mode 100644
index 00000000..87cd2500
--- /dev/null
+++ b/proposals/ml_predict.json
@@ -0,0 +1,49 @@
+{
+ "id": "ml_predict",
+ "summary": "Predict using ML",
+ "description": "Applies a machine learning model to a data cube of input features and returns the predicted values.",
+ "categories": [
+ "machine learning"
+ ],
+ "experimental": true,
+ "parameters": [
+ {
+ "name": "data",
+ "description": "The data cube containing the input features.",
+ "schema": {
+ "type": "object",
+ "subtype": "datacube"
+ }
+ },
+ {
+ "name": "model",
+ "description": "A ML model that was trained with one of the ML training processes such as ``ml_fit_regr_random_forest()``.",
+ "schema": {
+ "type": "object",
+ "subtype": "ml-model"
+ }
+ },
+ {
+ "name": "dimensions",
+ "description": "Zero or more dimensions that will be reduced by the model. Fails with a `DimensionNotAvailable` exception if one of the specified dimensions does not exist.",
+ "schema": {
+ "type": "array",
+ "items": {
+ "type": "string"
+ }
+ }
+ }
+ ],
+ "returns": {
+ "description": "A data cube with the predicted values. It removes the specified dimensions and adds a new dimension for the predicted values. It has the name `predictions` and is of type `other`. If a single value is returned, the dimension has a single label with name `0`.",
+ "schema": {
+ "type": "object",
+ "subtype": "datacube",
+ "dimensions": [
+ {
+ "type": "other"
+ }
+ ]
+ }
+ }
+}
diff --git a/proposals/predict_curve.json b/proposals/predict_curve.json
index 479b7fec..c4d78d99 100644
--- a/proposals/predict_curve.json
+++ b/proposals/predict_curve.json
@@ -1,7 +1,7 @@
{
"id": "predict_curve",
- "summary": "Predict values",
- "description": "Predict values using a model function and pre-computed parameters. The process is intended to compute values for new labels.",
+ "summary": "Predict values using a model function",
+ "description": "Predict values using a model function and pre-computed parameters. The process is primarily intended to compute values for new labels, but it can also fill gaps where existing labels contain no-data (`null`) values.",
"categories": [
"cubes",
"math"
diff --git a/proposals/save_ml_model.json b/proposals/save_ml_model.json
new file mode 100644
index 00000000..5e9ea8b0
--- /dev/null
+++ b/proposals/save_ml_model.json
@@ -0,0 +1,44 @@
+{
+ "id": "save_ml_model",
+ "summary": "Save a ML model",
+ "description": "Saves a machine learning model as part of a batch job.\n\nThe model will be accompanied by a separate STAC Item that implements the [ml-model extension](https://github.com/stac-extensions/ml-model).",
+ "categories": [
+ "machine learning",
+ "export"
+ ],
+ "experimental": true,
+ "parameters": [
+ {
+ "name": "data",
+ "description": "The data to store as a machine learning model.",
+ "schema": {
+ "type": "object",
+ "subtype": "ml-model"
+ }
+ },
+ {
+ "name": "options",
+ "description": "Additional parameters to create the file(s).",
+ "schema": {
+ "type": "object",
+ "additionalProperties": false
+ },
+ "default": {},
+ "optional": true
+ }
+ ],
+ "returns": {
+ "description": "Returns `false` if the process failed to store the model, `true` otherwise.",
+ "schema": {
+ "type": "boolean"
+ }
+ },
+ "links": [
+ {
+ "href": "https://github.com/stac-extensions/ml-model",
+ "title": "STAC ml-model extension",
+ "type": "text/html",
+ "rel": "about"
+ }
+ ]
+}
\ No newline at end of file