diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
index 29baf6db..b09f6e3d 100644
--- a/.github/workflows/docs.yml
+++ b/.github/workflows/docs.yml
@@ -19,7 +19,7 @@ jobs:
       - run: |
           npm install
           npm run generate
-        working-directory: tests
+        working-directory: dev
       - name: clone gh-pages and clean-up
         if: ${{ env.GITHUB_REF_SLUG == 'master' }}
         run: |
@@ -31,8 +31,8 @@ jobs:
         if: ${{ env.GITHUB_REF_SLUG != 'master' }}
         run: mkdir gh-pages
       - run: |
-          cp tests/docs.html index.html
-          cp tests/processes.json processes.json
+          cp dev/docs.html index.html
+          cp dev/processes.json processes.json
           rsync -vrm --include='*.json' --include='*.html' --include='meta/***' --include='proposals/***' --exclude='*' . gh-pages
       - name: deploy to root (master)
         uses: peaceiris/actions-gh-pages@v3
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index b108eb18..25659365 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -8,8 +8,8 @@ jobs:
       with:
         node-version: 'lts/*'
       - uses: actions/checkout@v3
-      - name: Run tests
+      - name: Run linter
        run: |
          npm install
-         npm run test
-        working-directory: tests
\ No newline at end of file
+         npm test
+        working-directory: dev
\ No newline at end of file
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 407447dc..bb7814c5 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -16,7 +16,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
   - `filter_vector`
   - `flatten_dimensions`
   - `load_geojson`
+  - `load_ml_model`
   - `load_url`
+  - `ml_fit_class_random_forest`
+  - `ml_fit_class_xgboost`
+  - `ml_fit_regr_random_forest`
+  - `ml_predict`
+  - `save_ml_model`
   - `unflatten_dimension`
   - `vector_buffer`
   - `vector_reproject`
diff --git a/dev/.gitignore b/dev/.gitignore
new file mode 100644
index 00000000..e29b5fbe
--- /dev/null
+++ b/dev/.gitignore
@@ -0,0 +1,3 @@
+/node_modules/
+/package-lock.json
+/processes.json
diff --git a/dev/.words b/dev/.words
new file mode 100644
index 00000000..1471ac26
--- /dev/null
+++ b/dev/.words
@@ -0,0 +1,57 @@
+0-to-9
+1-to-0
+anno
+behavior
+boolean
+center
+centers
+dekad
+DEM-based
+Domini
+gamma0
+GeoJSON
+FeatureCollections
+labeled
+MathWorld
+n-ary
+neighbor
+neighborhood
+neighborhoods
+openEO
+orthorectification
+orthorectified
+radiometrically
+reflectances
+reproject
+reprojected
+Reprojects
+resample
+resampled
+resamples
+Resamples
+resampling
+Sentinel-2
+Sentinel-2A
+Sentinel-2B
+signum
+STAC
+catalog
+Catalog
+summand
+UDFs
+gdalwarp
+Lanczos
+sinc
+interpolants
+Breiman
+Hyndman
+date1
+date2
+favor
+XGBoost
+Chen
+Guestrin
+early_stopping_rounds
+Subsample
+hessian
+overfitting
diff --git a/dev/README.md b/dev/README.md
new file mode 100644
index 00000000..fc2382fe
--- /dev/null
+++ b/dev/README.md
@@ -0,0 +1,46 @@
+# Tests for openEO Processes
+
+To run the tests, follow these steps:
+
+1. Install [node and npm](https://nodejs.org) - any recent version should work
+2. Run `npm install` in this folder to install the dependencies
+3. Run the tests with `npm test`. This will also lint the files and verify that they follow best practices.
+4. To show the files nicely formatted in a web browser, run `npm start`. It starts a server and opens the corresponding page in a web browser.
+
+## Development processes
+
+All new processes must be added to the `proposals` folder. Each process must be declared `experimental`.
+Processes must comply with best practices, which ensure a certain degree of consistency.
+`npm test` will validate and lint the processes and also ensure that the best practices are applied.
+
+The linting checks that the files are named correctly and that the content is correctly formatted and indented (JSON and embedded CommonMark).
+The best practices ensure, for example, that fields such as descriptions are neither too short nor too long.
+
+A spell check also checks the texts. It may report names and rarely used technical words as errors.
+If you are sure that these are correct, you can add them to the `.words` file to exclude them from being reported as errors.
+The file must contain one word per line.
+
+New processes should be added via GitHub Pull Requests.
+
+## Subtype schemas
+
+Sometimes it is useful to define a new "data type" on top of the JSON types (number, string, array, object, ...).
+For example, a client could make a select box with all collections available by adding a subtype `collection-id` to the JSON type `string`.
+If you think a new subtype should be added, you need to add it to the `meta/subtype-schemas.json` file.
+It must be a valid JSON Schema. The tests mentioned above will also verify to a certain degree that the subtypes are defined correctly. A sketch of a possible entry follows below.
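+
+For illustration, here is a sketch of a hypothetical `wavelength-range` subtype (this subtype does not exist; the name, title, and description are made up to show the expected structure):
+
+```json
+"wavelength-range": {
+    "type": "array",
+    "subtype": "wavelength-range",
+    "title": "Wavelength Range",
+    "description": "A wavelength range in micrometers, given as a two-element array with the minimum and maximum value.",
+    "minItems": 2,
+    "maxItems": 2,
+    "items": {
+        "type": "number"
+    }
+}
+```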
diff --git a/dev/docs.html b/dev/docs.html
new file mode 100644
index 00000000..04b1c192
--- /dev/null
+++ b/dev/docs.html
@@ -0,0 +1,125 @@
+[125 lines of HTML for the documentation viewer page titled "openEO API Processes"; the markup was stripped in this rendering and is not recoverable]
\ No newline at end of file
diff --git a/dev/package.json b/dev/package.json
new file mode 100644
index 00000000..1da8693f
--- /dev/null
+++ b/dev/package.json
@@ -0,0 +1,30 @@
+{
+    "name": "@openeo/processes",
+    "version": "2.0.0-rc.1",
+    "author": "openEO Consortium",
+    "contributors": [
+        {
+            "name": "Matthias Mohr"
+        }
+    ],
+    "license": "Apache-2.0",
+    "description": "Validates the processes specified in this repository.",
+    "homepage": "http://openeo.org",
+    "bugs": {
+        "url": "https://github.com/Open-EO/openeo-processes/issues"
+    },
+    "repository": {
+        "type": "git",
+        "url": "git+https://github.com/Open-EO/openeo-processes.git"
+    },
+    "devDependencies": {
+        "@openeo/processes-lint": "^0.1.5",
+        "concat-json-files": "^1.1.0",
+        "http-server": "^14.1.1"
+    },
+    "scripts": {
+        "test": "openeo-processes-lint testConfig.json",
+        "generate": "concat-json-files \"../{*,proposals/*}.json\" -t \"processes.json\"",
+        "start": "npm run generate && http-server -p 9876 -o docs.html -c-1"
+    }
+}
diff --git a/dev/testConfig.json b/dev/testConfig.json
new file mode 100644
index 00000000..9b5fbcb2
--- /dev/null
+++ b/dev/testConfig.json
@@ -0,0 +1,14 @@
+{
+    "folder": "../",
+    "proposalsFolder": "../proposals/",
+    "ignoredWords": ".words",
+    "anyOfRequired": [
+        "array_element",
+        "quantiles"
+    ],
+    "subtypeSchemas": "../meta/subtype-schemas.json",
+    "checkSubtypeSchemas": true,
+    "forbidDeprecatedTypes": false,
+    "checkProcessLinks": true,
+    "verbose": false
+}
diff --git a/meta/subtype-schemas.json b/meta/subtype-schemas.json
index 347df234..83ce72ba 100644
--- a/meta/subtype-schemas.json
+++ b/meta/subtype-schemas.json
@@ -232,6 +232,12 @@
             }
         }
     },
+    "ml-model": {
+        "type": "object",
+        "subtype": "ml-model",
+        "title": "Machine Learning Model",
+        "description": "A machine learning model, accompanied by STAC metadata that implements the STAC ml-model extension."
+    },
     "output-format": {
         "type": "string",
         "subtype": "output-format",
diff --git a/proposals/ml_fit_class_xgboost.json b/proposals/ml_fit_class_xgboost.json
new file mode 100644
index 00000000..cced25b1
--- /dev/null
+++ b/proposals/ml_fit_class_xgboost.json
@@ -0,0 +1,118 @@
+{
+    "id": "ml_fit_class_xgboost",
+    "summary": "Train an XGBoost classification model",
+    "description": "Fit an XGBoost classification model to training data. XGBoost is a high-performance, flexible, and portable distributed gradient boosting library. It implements machine learning algorithms within the Gradient Boosting framework, featuring parallel tree boosting for efficiency.",
+    "categories": [
+        "machine learning"
+    ],
+    "experimental": true,
+    "parameters": [
+        {
+            "name": "predictors",
+            "description": "The predictors for the XGBoost classification model as a vector data cube. They are the independent variables that the XGBoost algorithm analyses to learn patterns and relationships within the data.",
+            "schema": {
+                "type": "object",
+                "subtype": "datacube",
+                "dimensions": [
+                    {
+                        "type": "geometry"
+                    },
+                    {
+                        "type": "bands"
+                    }
+                ]
+            }
+        },
+        {
+            "name": "target",
+            "description": "Labeled data for XGBoost classification, aligning with predictor values based on a shared geometry dimension. This ensures a clear connection between predictor rows and labels.",
+            "schema": {
+                "type": "object",
+                "subtype": "datacube",
+                "dimensions": [
+                    {
+                        "type": "geometry"
+                    }
+                ]
+            }
+        },
+        {
+            "name": "learning_rate",
+            "description": "Step size shrinkage used in update to prevent overfitting.",
+            "optional": true,
+            "default": 0.15,
+            "schema": {
+                "type": "number",
+                "minimum": 0
+            }
+        },
+        {
+            "name": "max_depth",
+            "description": "Maximum depth of a tree.",
+            "optional": true,
+            "default": 5,
+            "schema": {
+                "type": "integer",
+                "minimum": 1
+            }
+        },
+        {
+            "name": "min_child_weight",
+            "description": "Minimum sum of instance weight (hessian) needed in a child.",
+            "optional": true,
+            "default": 1,
+            "schema": {
+                "type": "number",
+                "minimum": 0
+            }
+        },
+        {
+            "name": "subsample",
+            "description": "Subsample ratio of the training instances.",
+            "optional": true,
+            "default": 0.8,
+            "schema": {
+                "type": "number",
+                "minimum": 0,
+                "maximum": 1
+            }
+        },
+        {
+            "name": "min_split_loss",
+            "description": "Minimum loss reduction required to make a further partition on a leaf node of the tree.",
+            "optional": true,
+            "default": 1,
+            "schema": {
+                "type": "number",
+                "minimum": 0
+            }
+        },
+        {
+            "name": "seed",
+            "description": "A randomization seed to use for the random sampling in training. If not given or `null`, no seed is used and results may differ on subsequent use.",
+            "optional": true,
+            "default": null,
+            "schema": {
+                "type": [
+                    "integer",
+                    "null"
+                ]
+            }
+        }
+    ],
+    "returns": {
+        "description": "A model object that can be saved with ``save_ml_model()`` and restored with ``load_ml_model()``.",
+        "schema": {
+            "type": "object",
+            "subtype": "ml-model"
+        }
+    },
+    "links": [
+        {
+            "href": "https://dl.acm.org/doi/10.1145/2939672.2939785",
+            "title": "Chen and Guestrin (2016), XGBoost: A Scalable Tree Boosting System",
+            "type": "text/html",
+            "rel": "about"
+        }
+    ]
+}
\ No newline at end of file
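For illustration, a minimal openEO process graph that trains and stores such a model might look as follows. This is only a sketch: the node names are arbitrary, and the `predictors` and `labels` process parameters are assumed to provide suitably prepared vector data cubes (e.g. created beforehand with `aggregate_spatial`).

```json
{
    "train": {
        "process_id": "ml_fit_class_xgboost",
        "arguments": {
            "predictors": {"from_parameter": "predictors"},
            "target": {"from_parameter": "labels"},
            "max_depth": 5
        }
    },
    "save": {
        "process_id": "save_ml_model",
        "arguments": {
            "data": {"from_node": "train"}
        },
        "result": true
    }
}
```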
diff --git a/proposals/load_ml_model.json b/proposals/load_ml_model.json
new file mode 100644
index 00000000..7fa86d89
--- /dev/null
+++ b/proposals/load_ml_model.json
@@ -0,0 +1,46 @@
+{
+    "id": "load_ml_model",
+    "summary": "Load an ML model",
+    "description": "Loads a machine learning model from a STAC Item.\n\nSuch a model could be trained and saved as part of a previous batch job with processes such as ``ml_fit_regr_random_forest()`` and ``save_ml_model()``.",
+    "categories": [
+        "machine learning",
+        "import"
+    ],
+    "experimental": true,
+    "parameters": [
+        {
+            "name": "uri",
+            "description": "The STAC Item to load the machine learning model from. The STAC Item must implement the `ml-model` extension.",
+            "schema": [
+                {
+                    "title": "URL",
+                    "type": "string",
+                    "format": "uri",
+                    "subtype": "uri",
+                    "pattern": "^https?://"
+                },
+                {
+                    "title": "User-uploaded File",
+                    "type": "string",
+                    "subtype": "file-path",
+                    "pattern": "^[^\r\n\\:'\"]+$"
+                }
+            ]
+        }
+    ],
+    "returns": {
+        "description": "A machine learning model to be used with machine learning processes such as ``ml_predict()``.",
+        "schema": {
+            "type": "object",
+            "subtype": "ml-model"
+        }
+    },
+    "links": [
+        {
+            "href": "https://github.com/stac-extensions/ml-model",
+            "title": "STAC ml-model extension",
+            "type": "text/html",
+            "rel": "about"
+        }
+    ]
+}
diff --git a/proposals/ml_fit_class_random_forest.json b/proposals/ml_fit_class_random_forest.json
new file mode 100644
index 00000000..63da48a1
--- /dev/null
+++ b/proposals/ml_fit_class_random_forest.json
@@ -0,0 +1,110 @@
+{
+    "id": "ml_fit_class_random_forest",
+    "summary": "Train a random forest classification model",
+    "description": "Executes the fit of a random forest classification based on training data. The process does not include a separate split of the data into training, validation and test data. The Random Forest classification model is based on the approach by Breiman (2001).",
+    "categories": [
+        "machine learning"
+    ],
+    "experimental": true,
+    "parameters": [
+        {
+            "name": "predictors",
+            "description": "The predictors for the classification model as a vector data cube. They are aggregated to the features (vectors) of the target input variable.",
+            "schema": [
+                {
+                    "type": "object",
+                    "subtype": "datacube",
+                    "dimensions": [
+                        {
+                            "type": "geometry"
+                        },
+                        {
+                            "type": "bands"
+                        }
+                    ]
+                },
+                {
+                    "type": "object",
+                    "subtype": "datacube",
+                    "dimensions": [
+                        {
+                            "type": "geometry"
+                        },
+                        {
+                            "type": "other"
+                        }
+                    ]
+                }
+            ]
+        },
+        {
+            "name": "target",
+            "description": "The training sites for the classification model as a vector data cube. This is associated with the target variable for the Random Forest model. The geometry has to be associated with a value to predict (e.g. fractional forest canopy cover).",
+            "schema": {
+                "type": "object",
+                "subtype": "datacube",
+                "dimensions": [
+                    {
+                        "type": "geometry"
+                    }
+                ]
+            }
+        },
+        {
+            "name": "max_variables",
+            "description": "Specifies how many split variables will be used at a node.\n\nThe following options are available:\n\n- *integer*: The given number of variables is considered for each split.\n- `all`: All variables are considered for each split.\n- `log2`: The logarithm with base 2 of the number of variables is considered for each split.\n- `onethird`: A third of the number of variables is considered for each split.\n- `sqrt`: The square root of the number of variables is considered for each split. This is often the default for classification.",
+            "schema": [
+                {
+                    "type": "integer",
+                    "minimum": 1
+                },
+                {
+                    "type": "string",
+                    "enum": [
+                        "all",
+                        "log2",
+                        "onethird",
+                        "sqrt"
+                    ]
+                }
+            ]
+        },
+        {
+            "name": "num_trees",
+            "description": "The number of trees built within the Random Forest classification.",
+            "optional": true,
+            "default": 100,
+            "schema": {
+                "type": "integer",
+                "minimum": 1
+            }
+        },
+        {
+            "name": "seed",
+            "description": "A randomization seed to use for the random sampling in training. If not given or `null`, no seed is used and results may differ on subsequent use.",
+            "optional": true,
+            "default": null,
+            "schema": {
+                "type": [
+                    "integer",
+                    "null"
+                ]
+            }
+        }
+    ],
+    "returns": {
+        "description": "A model object that can be saved with ``save_ml_model()`` and restored with ``load_ml_model()``.",
+        "schema": {
+            "type": "object",
+            "subtype": "ml-model"
+        }
+    },
+    "links": [
+        {
+            "href": "https://doi.org/10.1023/A:1010933404324",
+            "title": "Breiman (2001): Random Forests",
+            "type": "text/html",
+            "rel": "about"
+        }
+    ]
+}
diff --git a/proposals/ml_fit_regr_random_forest.json b/proposals/ml_fit_regr_random_forest.json
new file mode 100644
index 00000000..39207324
--- /dev/null
+++ b/proposals/ml_fit_regr_random_forest.json
@@ -0,0 +1,110 @@
+{
+    "id": "ml_fit_regr_random_forest",
+    "summary": "Train a random forest regression model",
+    "description": "Executes the fit of a random forest regression based on training data. The process does not include a separate split of the data into training, validation and test data. The Random Forest regression model is based on the approach by Breiman (2001).",
+    "categories": [
+        "machine learning"
+    ],
+    "experimental": true,
+    "parameters": [
+        {
+            "name": "predictors",
+            "description": "The predictors for the regression model as a vector data cube. They are aggregated to the features (vectors) of the target input variable.",
+            "schema": [
+                {
+                    "type": "object",
+                    "subtype": "datacube",
+                    "dimensions": [
+                        {
+                            "type": "geometry"
+                        },
+                        {
+                            "type": "bands"
+                        }
+                    ]
+                },
+                {
+                    "type": "object",
+                    "subtype": "datacube",
+                    "dimensions": [
+                        {
+                            "type": "geometry"
+                        },
+                        {
+                            "type": "other"
+                        }
+                    ]
+                }
+            ]
+        },
+        {
+            "name": "target",
+            "description": "The training sites for the regression model as a vector data cube. This is associated with the target variable for the Random Forest model. The geometry has to be associated with a value to predict (e.g. fractional forest canopy cover).",
+            "schema": {
+                "type": "object",
+                "subtype": "datacube",
+                "dimensions": [
+                    {
+                        "type": "geometry"
+                    }
+                ]
+            }
+        },
+        {
+            "name": "max_variables",
+            "description": "Specifies how many split variables will be used at a node.\n\nThe following options are available:\n\n- *integer*: The given number of variables is considered for each split.\n- `all`: All variables are considered for each split.\n- `log2`: The logarithm with base 2 of the number of variables is considered for each split.\n- `onethird`: A third of the number of variables is considered for each split. This is often the default for regression.\n- `sqrt`: The square root of the number of variables is considered for each split.",
+            "schema": [
+                {
+                    "type": "integer",
+                    "minimum": 1
+                },
+                {
+                    "type": "string",
+                    "enum": [
+                        "all",
+                        "log2",
+                        "onethird",
+                        "sqrt"
+                    ]
+                }
+            ]
+        },
+        {
+            "name": "num_trees",
+            "description": "The number of trees built within the Random Forest regression.",
+            "optional": true,
+            "default": 100,
+            "schema": {
+                "type": "integer",
+                "minimum": 1
+            }
+        },
+        {
+            "name": "seed",
+            "description": "A randomization seed to use for the random sampling in training. If not given or `null`, no seed is used and results may differ on subsequent use.",
+            "optional": true,
+            "default": null,
+            "schema": {
+                "type": [
+                    "integer",
+                    "null"
+                ]
+            }
+        }
+    ],
+    "returns": {
+        "description": "A model object that can be saved with ``save_ml_model()`` and restored with ``load_ml_model()``.",
+        "schema": {
+            "type": "object",
+            "subtype": "ml-model"
+        }
+    },
+    "links": [
+        {
+            "href": "https://doi.org/10.1023/A:1010933404324",
+            "title": "Breiman (2001): Random Forests",
+            "type": "text/html",
+            "rel": "about"
+        }
+    ]
+}
diff --git a/proposals/ml_predict.json b/proposals/ml_predict.json
new file mode 100644
index 00000000..87cd2500
--- /dev/null
+++ b/proposals/ml_predict.json
@@ -0,0 +1,49 @@
+{
+    "id": "ml_predict",
+    "summary": "Predict using ML",
+    "description": "Applies a machine learning model to a data cube of input features and returns the predicted values.",
+    "categories": [
+        "machine learning"
+    ],
+    "experimental": true,
+    "parameters": [
+        {
+            "name": "data",
+            "description": "The data cube containing the input features.",
+            "schema": {
+                "type": "object",
+                "subtype": "datacube"
+            }
+        },
+        {
+            "name": "model",
+            "description": "An ML model that was trained with one of the ML training processes such as ``ml_fit_regr_random_forest()``.",
+            "schema": {
+                "type": "object",
+                "subtype": "ml-model"
+            }
+        },
+        {
+            "name": "dimensions",
+            "description": "Zero or more dimensions that will be reduced by the model. Fails with a `DimensionNotAvailable` exception if one of the specified dimensions does not exist.",
+            "schema": {
+                "type": "array",
+                "items": {
+                    "type": "string"
+                }
+            }
+        }
+    ],
+    "returns": {
+        "description": "A data cube with the predicted values. It removes the specified dimensions and adds a new dimension for the predicted values. The new dimension has the name `predictions` and is of type `other`. If a single value is returned, the dimension has a single label with the name `0`.",
+        "schema": {
+            "type": "object",
+            "subtype": "datacube",
+            "dimensions": [
+                {
+                    "type": "other"
+                }
+            ]
+        }
+    }
+}
diff --git a/proposals/predict_curve.json b/proposals/predict_curve.json
index 479b7fec..c4d78d99 100644
--- a/proposals/predict_curve.json
+++ b/proposals/predict_curve.json
@@ -1,7 +1,7 @@
 {
     "id": "predict_curve",
-    "summary": "Predict values",
-    "description": "Predict values using a model function and pre-computed parameters. The process is intended to compute values for new labels.",
+    "summary": "Predict values using a model function",
+    "description": "Predict values using a model function and pre-computed parameters. The process is primarily intended to compute values for new labels, but it can also fill gaps where existing labels contain no-data (`null`) values.",
     "categories": [
         "cubes",
         "math"
diff --git a/proposals/save_ml_model.json b/proposals/save_ml_model.json
new file mode 100644
index 00000000..5e9ea8b0
--- /dev/null
+++ b/proposals/save_ml_model.json
@@ -0,0 +1,44 @@
+{
+    "id": "save_ml_model",
+    "summary": "Save an ML model",
+    "description": "Saves a machine learning model as part of a batch job.\n\nThe model will be accompanied by a separate STAC Item that implements the [ml-model extension](https://github.com/stac-extensions/ml-model).",
+    "categories": [
+        "machine learning",
+        "export"
+    ],
+    "experimental": true,
+    "parameters": [
+        {
+            "name": "data",
+            "description": "The data to store as a machine learning model.",
+            "schema": {
+                "type": "object",
+                "subtype": "ml-model"
+            }
+        },
+        {
+            "name": "options",
+            "description": "Additional parameters to create the file(s).",
+            "schema": {
+                "type": "object",
+                "additionalProperties": false
+            },
+            "default": {},
+            "optional": true
+        }
+    ],
+    "returns": {
+        "description": "Returns `false` if the process failed to store the model, `true` otherwise.",
+        "schema": {
+            "type": "boolean"
+        }
+    },
+    "links": [
+        {
+            "href": "https://github.com/stac-extensions/ml-model",
+            "title": "STAC ml-model extension",
+            "type": "text/html",
+            "rel": "about"
+        }
+    ]
+}
\ No newline at end of file
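For illustration, a minimal inference sketch that combines these processes into an openEO process graph; the collection ID, the extents, and the model URL are placeholders, not values defined by this repository.

```json
{
    "data": {
        "process_id": "load_collection",
        "arguments": {
            "id": "SENTINEL2_L2A",
            "spatial_extent": {"west": 16.1, "south": 48.1, "east": 16.6, "north": 48.4},
            "temporal_extent": ["2022-06-01", "2022-09-01"]
        }
    },
    "model": {
        "process_id": "load_ml_model",
        "arguments": {
            "uri": "https://example.com/collections/models/items/rf-1"
        }
    },
    "predict": {
        "process_id": "ml_predict",
        "arguments": {
            "data": {"from_node": "data"},
            "model": {"from_node": "model"},
            "dimensions": ["bands"]
        },
        "result": true
    }
}
```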