diff --git a/.github/workflows/publish.yaml b/.github/workflows/publish.yaml index 811d582..75f3ce4 100644 --- a/.github/workflows/publish.yaml +++ b/.github/workflows/publish.yaml @@ -92,21 +92,24 @@ jobs: git config --local user.name "$GITHUB_ACTOR" git config --local user.email "$GITHUB_ACTOR@users.noreply.github.com" git commit ${NB_PATH[*]} -m "Quarto output" || echo "No changes to commit" - git push origin || echo "No changes to commit" + git push || echo "No changes to commit" - name: Commit Files Pythia Book if: github.ref == 'refs/heads/main' run: | - python src/render_sfinx_toc.py + pip install . + python src/eo_datascience/render_sfinx_toc.py + python src/eo_datascience/clean_nb.py cp ./_toc.yml ./pythia/ cp ./Makefile ./pythia/ + cp ./chapters/references.bib ./pythia/notebooks/ cp -R ./notebooks/. ./pythia/notebooks/ cd ./pythia git config --local user.name "$GITHUB_ACTOR" git config --local user.email "$GITHUB_ACTOR@users.noreply.github.com" git add . git commit -m "Quarto output" || echo "No changes to commit" - git push origin || echo "No changes to commit" + git push || echo "No changes to commit" - name: Commit Files Quarto Book Branch dev if: github.ref == 'refs/heads/dev' @@ -115,14 +118,17 @@ jobs: git config --local user.name "$GITHUB_ACTOR" git config --local user.email "$GITHUB_ACTOR@users.noreply.github.com" git commit ${NB_PATH[*]} -m "Quarto output" || echo "No changes to commit" - git push origin dev || echo "No changes to commit" + git push || echo "No changes to commit" - name: Commit Files Pythia Book Branch dev if: github.ref == 'refs/heads/dev' run: | - python src/render_sfinx_toc.py + pip install . + python src/eo_datascience/render_sfinx_toc.py + python src/eo_datascience/clean_nb.py cp ./_toc.yml ./pythia/ cp ./Makefile ./pythia/ + cp ./chapters/references.bib ./pythia/notebooks/ cp -R ./notebooks/. ./pythia/notebooks/ cd ./pythia git config --local user.name "$GITHUB_ACTOR" diff --git a/.gitignore b/.gitignore index c01a887..8c0afd3 100644 --- a/.gitignore +++ b/.gitignore @@ -54,6 +54,8 @@ docs/_rst/* docs/_build/* cover/* MANIFEST +/**/*.html +/**/*_files/ # Per-project virtualenvs .venv*/ diff --git a/Makefile b/Makefile index 84f16e8..9589654 100644 --- a/Makefile +++ b/Makefile @@ -38,7 +38,6 @@ post-render: for i in $(QMD); do quarto convert $$i; done - mv chapters/*.ipynb notebooks/ >/dev/null 2>&1 - for f in chapters/*.quarto_ipynb ; do mv -- "$f" "${f%.quarto_ipynb}.ipynb" >/dev/null 2>&1; done - python src/clean-nb.py cp Makefile notebooks/ data: diff --git a/chapters/01_classification.qmd b/chapters/01_classification.qmd index a2c95a2..3cf22f1 100644 --- a/chapters/01_classification.qmd +++ b/chapters/01_classification.qmd @@ -1,5 +1,5 @@ --- -title: Classification +title: Classification of Sentinel-2 imagery subtitle: Finding forests with satelite imagery jupyter: kernelspec: diff --git a/chapters/02_floodmapping.qmd b/chapters/02_floodmapping.qmd index 413a4d4..fd2a6c2 100644 --- a/chapters/02_floodmapping.qmd +++ b/chapters/02_floodmapping.qmd @@ -10,6 +10,11 @@ jupyter: ![Image from [wikipedia](https://upload.wikimedia.org/wikipedia/commons/d/d4/Thomas_Bayes.gif)](https://upload.wikimedia.org/wikipedia/commons/d/d4/Thomas_Bayes.gif) +::: {.callout-note} +This notebooks contains interactive element. These interactive element can only be viewed on Binder by clicking on the Binder badge or 🚀 button. +::: + + ```{python} %matplotlib widget diff --git a/chapters/references.bib b/chapters/references.bib index 9362bb7..f2eae64 100644 --- a/chapters/references.bib +++ b/chapters/references.bib @@ -9,7 +9,6 @@ @acelik/colors-methods-and-mistakes-in-spatial-data-visualization-60d02e7f09fd author = {Celik, Anil}, month = aug, year = {2015}, - file = {Snapshot:/home/mschobbe/Zotero/storage/BCKHNHID/colors-methods-and-mistakes-in-spatial-data-visualization-60d02e7f09fd.html:text/html}, } @article{nguyen_examining_2021, @@ -134,11 +133,13 @@ @article{bauer-marschallinger_toward_2019 file = {IEEE Xplore Abstract Record:/home/mschobbe/Zotero/storage/5QJ4V6PY/8444430.html:text/html}, } -@software{quast_getting_2024, - author = {Raphael Quast}, - title = {EOmaps: A python package to visualize and analyze geographical datasets.}, - doi = {10.5281/zenodo.6459598}, - url = {https://doi.org/10.5281/zenodo.6459598} +@article{quast_getting_2024, + author = {Raphael Quast}, + title = {EOmaps: A python package to visualize and analyze geographical datasets.}, + doi = {10.5281/zenodo.6459598}, + url = {https://doi.org/10.5281/zenodo.6459598}, + journal = {}, + year = {2024} } @article{rouse1974monitoring, @@ -157,4 +158,5 @@ @online{nasa2020 year = {2020}, url = {https://earthobservatory.nasa.gov/features/MeasuringVegetation/measuring_vegetation_2.php}, urldate = {2024-07-26} -} \ No newline at end of file +} + diff --git a/environment.yml b/environment.yml new file mode 100644 index 0000000..3728b1f --- /dev/null +++ b/environment.yml @@ -0,0 +1,9 @@ +name: eo-datascience +channels: + - conda-forge +dependencies: + - python=3.10 + - pip + - mamba + - jupyter + - nbformat \ No newline at end of file diff --git a/notebooks/01_classification.ipynb b/notebooks/01_classification.ipynb index 4700686..5c6a1e2 100644 --- a/notebooks/01_classification.ipynb +++ b/notebooks/01_classification.ipynb @@ -1,593 +1,607 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Classification\n", - "Finding forests with satelite imagery\n" - ] + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "title: Classification of Sentinel-2 imagery\n", + "subtitle: Finding forests with satelite imagery\n", + "jupyter: \n", + " kernelspec:\n", + " name: \"01_classification\"\n", + " language: \"python\"\n", + " display_name: \"01_classification\"\n", + "format: \n", + " html:\n", + " code-fold: show\n", + "eval: true\n", + "---\n", + "\n", + "## Data Acquisition\n", + "In this chapter, we will employ machine learning techniques to classify a scene using satellite imagery. Specifically, we will utilize ``scikit-learn`` to implement two distinct classifiers and subsequently compare their results. To begin, we need to import the following modules." + ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "from datetime import datetime, timedelta\n", + "\n", + "import xarray as xr\n", + "import pystac_client\n", + "import stackstac\n", + "import odc.stac\n", + "import rioxarray\n", + "import geopandas as gpd\n", + "from odc.geo.geobox import GeoBox\n", + "from dask.diagnostics import ProgressBar\n", + "from rasterio.enums import Resampling\n", + "from shapely.geometry import Polygon, mapping\n", + "\n", + "import cmcrameri as cmc\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "import pandas as pd\n", + "from pathlib import Path\n", + "\n", + "# Scikit Learn\n", + "from sklearn.naive_bayes import GaussianNB\n", + "from sklearn.metrics import confusion_matrix\n", + "from sklearn.metrics import classification_report\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.model_selection import train_test_split\n", + "\n", + "import matplotlib.colors as colors" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Before we start, we need to load the data. We will use ``odc-stac`` to obtain data from Earth Search by Element 84. Here we define the area of interest and the time frame, aswell as the EPSG code and the resolution.\n", + "\n", + "### Searching in the Catalog\n", + "The module ``odc-stac`` provides access to free, open source satelite data. To retrieve the data, we must define several parameters that specify the location and time period for the satellite data. Additionally, we must specify the data collection we wish to access, as multiple collections are available. In this example, we will use multispectral imagery from the Sentinel-2 satellite." + ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "dx = 0.0006 # 60m resolution\n", + "epsg = 4326\n", + "\n", + "# Set Spatial extent\n", + "latmin, latmax = 47.86, 48.407\n", + "lonmin, lonmax = 16.32, 16.9\n", + "bounds = (lonmin, latmin, lonmax, latmax)\n", + "minx, miny, maxx, maxy = bounds\n", + "geom = {\n", + " 'type': 'Polygon',\n", + " 'coordinates': [[\n", + " [minx, miny],\n", + " [minx, maxy],\n", + " [maxx, maxy],\n", + " [maxx, miny],\n", + " [minx, miny]\n", + " ]]\n", + "}\n", + "\n", + "# Set Temporal extent\n", + "year = 2024\n", + "month = 5\n", + "day = 1\n", + "delta = 10\n", + "\n", + "start_date = datetime(year, month, day)\n", + "end_date = start_date + timedelta(days=delta)\n", + "date_query = start_date.strftime(\"%Y-%m-%d\") + \"/\" + end_date.strftime(\"%Y-%m-%d\")\n", + "\n", + "# Search for Sentinel-2 data\n", + "items = pystac_client.Client.open(\n", + " \"https://earth-search.aws.element84.com/v1\"\n", + ").search(\n", + " intersects=geom,\n", + " collections=[\"sentinel-2-l2a\"],\n", + " datetime=date_query,\n", + " limit=100,\n", + ").item_collection()\n", + "print(len(items), 'scenes found')" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We will now focus on the area south-east of Vienna, where the Nationalpark _Donauauen_ is situated. The time frame we are interested in is the beginning of May 2024.\n", + "After passing these parameters to the `stac-catalog` we have found **10 scenes** that we can use for our analysis. \n", + "\n", + "### Loading the Data\n", + "Now we will load the data directly into an ``xarray`` dataset, which we can use to perform computations on the data. ``xarray`` is a powerful library for working with multi-dimensional arrays, making it well-suited for handling satellite data.\n", + "\n", + "Here's how we can load the data using odc-stac and xarray:" + ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "#| output: false\n", + "# define a geobox for my region\n", + "geobox = GeoBox.from_bbox(bounds, crs=f\"epsg:{epsg}\", resolution=dx)\n", + "\n", + "# lazily combine items\n", + "ds_odc = odc.stac.load(\n", + " items,\n", + " bands=[\"scl\", \"red\", \"green\", \"blue\", \"nir\"],\n", + " chunks={'time': 5, 'x': 600, 'y': 600},\n", + " geobox=geobox,\n", + " resampling=\"bilinear\",\n", + ")\n", + "\n", + "# actually load it\n", + "ds_odc.load()" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Data Visualization\n", + "### RGB Image\n", + "With the image data now in our possession, we can proceed with computations and visualizations.\n", + "\n", + "First, we define a mask to exclude cloud cover and areas with missing data. Subsequently, we create a composite median image, where each pixel value represents the median value across all the scenes we have identified. This approach helps to eliminate clouds and outliers present in some of the images, thereby providing a clearer and more representative visualization of the scene." + ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "# define a mask for valid pixels (non-cloud)\n", + "def is_valid_pixel(data):\n", + " # include only vegetated, not_vegitated, water, and snow\n", + " return ((data > 3) & (data < 7)) | (data==11)\n", + "\n", + "ds_odc['valid'] = is_valid_pixel(ds_odc.scl)\n", + "#ds_odc.valid.sum(\"time\").plot()\n", + "\n", + "def avg(ds):\n", + " return (ds / ds.max() * 2)\n", + "\n", + "# compute the masked median\n", + "rgb_median = (\n", + " ds_odc[['red', 'green', 'blue']]\n", + " .where(ds_odc.valid)\n", + " .to_dataarray(dim=\"band\")\n", + " .transpose(..., \"band\")\n", + " .median(dim=\"time\")\n", + " .astype(int)\n", + ")\n", + "rgb_comp = avg(rgb_median)\n", + "plot = rgb_comp.plot.imshow(rgb=\"band\", figsize=(8, 8))\n", + "plot.axes.set_title(f\"RGB - Median Composite\\n{start_date.strftime('%d.%m.%Y')} - {end_date.strftime('%d.%m.%Y')}\")\n", + "plt.show()" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### False Color Image\n", + "In addition to the regular RGB Image, we can swap any of the bands from the visible spectrum with any other bands. In this specific case the red band has been changed to the near infrared band. This allows us to see vegetated areas more clearly, since they now appear in a bright red color. This is due to the fact that plants absorb regular red light while reflecting near infrared light [@nasa2020]." + ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "# compute the false color image\n", + "fc_median = (\n", + " ds_odc[['nir', 'green', 'blue']]\n", + " .where(ds_odc.valid)\n", + " .to_dataarray(dim=\"band\")\n", + " .transpose(..., \"band\")\n", + " .median(dim=\"time\")\n", + " .astype(int)\n", + ")\n", + "fc_comp = avg(fc_median)\n", + "plot = fc_comp.plot.imshow(rgb=\"band\", figsize=(8, 8))\n", + "plot.axes.set_title(f\"False Color - Median Composite\\n{start_date.strftime('%d.%m.%Y')} - {end_date.strftime('%d.%m.%Y')}\")\n", + "plt.show()" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### NDVI Image\n", + "To get an first impression of the data, we can calculate the NDVI (Normalized Difference Vegetation Index) and plot it. The NDVI is calculated by useing the following formula. [@rouse1974monitoring]\n", + "\n", + "$$\n", + "NDVI = \\frac{NIR - Red}{NIR + Red}\n", + "$$\n", + "\n", + "This gives us a good overview of the vegetation in the area. The values can range from -1 to 1 where the following meanings are associated with these values:\n", + "\n", + "- -1 to 0 indicate dead plants or inanimate objects\n", + "- 0 to 0.33 are unhealthy plants\n", + "- 0.33 to 0.66 are moderatly healthy plants\n", + "- 0.66 to 1 are very healthy plants" + ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "# Normalized Difference Vegetation Index (NDVI)\n", + "def normalized_difference(a, b):\n", + " return (a - b*1.) / (a + b)\n", + "\n", + "ndvi = normalized_difference(ds_odc.nir, ds_odc.red)\n", + "ndvi.median(dim=\"time\").plot.imshow(cmap='cmc.cork', vmin=-1, vmax=1).axes.set_title('NDVI')\n", + "plt.show()" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Classification \n", + "In this chapter, we will classify the satellite data to identify forested areas within the scene. By using supervised machine learning techniques, we can train classifiers to distinguish between forested and non-forested regions based on the training data we provide. We will explore two different classifiers and compare their performance in accurately identifying forest areas.\n", + "\n", + "### Regions of Interest\n", + "Since this is a supervised classification, we need to have some training data. Therefore we need to define areas or regions, which we are certain represent the feature which we are classifiying. In this case we are interested in forested areas and regions that are definitly not forested. These regions will be used to train our classifiers." + ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "# Define Polygons\n", + "forest_areas = {\n", + " 0: [Polygon([(16.482772, 47.901753), (16.465133, 47.870124), (16.510142, 47.874382), (16.482772, 47.901753)])],\n", + " 1: [Polygon([(16.594079, 47.938855), (16.581914, 47.894454), (16.620233, 47.910268), (16.594079, 47.938855)])],\n", + " 2: [Polygon([(16.67984, 47.978998), (16.637263, 47.971091), (16.660376, 47.929123), (16.67984, 47.978998)])],\n", + " 3: [Polygon([(16.756477, 48.000286), (16.723024, 47.983256), (16.739446, 47.972916), (16.756477, 48.000286)])],\n", + " 4: [Polygon([(16.80696, 48.135923), (16.780806, 48.125583), (16.798445, 48.115243), (16.80696, 48.135923)])],\n", + " 5: [Polygon([(16.684097, 48.144438), (16.664634, 48.124366), (16.690788, 48.118892), (16.684097, 48.144438)])],\n", + " 6: [Polygon([(16.550894, 48.169984), (16.530822, 48.165118), (16.558801, 48.137139), (16.550894, 48.169984)])],\n", + " 7: [Polygon([(16.588604, 48.402329), (16.556976, 48.401112), (16.580697, 48.382865), (16.588604, 48.402329)])],\n", + "}\n", + "\n", + "nonforest_areas = {\n", + " 0: [Polygon([(16.674974, 48.269126), (16.623882, 48.236281), (16.682272, 48.213168), (16.674974, 48.269126)])],\n", + " 1: [Polygon([(16.375723, 48.228374), (16.357476, 48.188839), (16.399444, 48.185798), (16.375723, 48.228374)])],\n", + " 2: [Polygon([(16.457834, 48.26426), (16.418907, 48.267301), (16.440804, 48.23324), (16.457834, 48.26426)])],\n", + " 3: [Polygon([(16.519266, 48.101861), (16.470607, 48.100645), (16.500411, 48.07145), (16.519266, 48.101861)])],\n", + " 4: [Polygon([(16.453577, 48.051986), (16.412217, 48.067192), (16.425598, 48.012451), (16.453577, 48.051986)])],\n", + "}\n", + "\n", + "# Geoppandas Dataframe from Polygons\n", + "forest_df = gpd.GeoDataFrame({'geometry': [poly[0] for poly in forest_areas.values()]}, crs=\"EPSG:4326\")\n", + "nonforest_df = gpd.GeoDataFrame({'geometry': [poly[0] for poly in nonforest_areas.values()]}, crs=\"EPSG:4326\")\n", + "\n", + "\n", + "# Plotting Regions of Interest\n", + "fig, ax = plt.subplots()\n", + "rgb_comp.plot.imshow(ax=ax)\n", + "forest_df.plot(ax=ax, ec='C0', fc='none')\n", + "nonforest_df.plot(ax=ax, ec='C1', fc='none')\n", + "ax.set_title('Regions of Interest')\n", + "ax.set_aspect('equal')\n", + "plt.show()" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Data Preparation\n", + "In addition to the Regions of Interest we will extract the specific bands from the loaded dataset that we intend to use for the classification, which are the `red, green, blue` and `near-infrared` bands, although other bands can also be utilized. Using these bands, we will create both a training and a testing dataset. The training dataset will be used to train the classifier, while the testing dataset will be employed to evaluate its performance." + ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "# Classifiying dataset (only necessary bands)\n", + "bands = ['red', 'green', 'blue', 'nir']\n", + "ds_class = (\n", + " ds_odc[bands]\n", + " .where(ds_odc.valid)\n", + " .median(dim=\"time\")\n", + ")\n", + "ds_class = avg(ds_class)\n", + "ds_class = ds_class.fillna(0)\n", + "\n", + "def clip_array(ds:xr.Dataset, polygons):\n", + " clipped = ds.rio.clip(polygons, invert=False, all_touched=False, drop=True)\n", + " clipped_nan = clipped.where(clipped == ds)\n", + " return clipped_nan\n", + "\n", + "# Dictionaries with Dataarrays, each clipped by a Polygon\n", + "data_dict_feat = {idx: clip_array(ds_class, polygon) for idx, polygon in forest_areas.items()}\n", + "data_dict_nonfeat = {idx: clip_array(ds_class, polygon) for idx, polygon in nonforest_areas.items()}\n", + "\n", + "# Reshape the polygon dataarrays to get a tuple (one value per band) of pixel values\n", + "feat_data = [xarray.to_array().values.reshape(len(bands),-1).T for xarray in data_dict_feat.values()] # replaced median_data_dict_feat with data_dict_feat\n", + "nonfeat_data = [xarray.to_array().values.reshape(len(bands),-1).T for xarray in data_dict_nonfeat.values()] # replaced median_data_dict_feat with data_dict_feat\n", + "\n", + "# The rows of the different polygons are concatenated to a single array for further processing\n", + "feat_values = np.concatenate(feat_data)\n", + "nonfeat_values = np.concatenate(nonfeat_data)\n", + "\n", + "# Drop Nan Values\n", + "X_feat_data = feat_values[~np.isnan(feat_values).any(axis=1)]\n", + "X_nonfeat_data = nonfeat_values[~np.isnan(nonfeat_values).any(axis=1)]\n", + "\n", + "# Creating Output Vector (1 for pixel is features; 0 for pixel is not feature)\n", + "y_feat_data = np.ones(X_feat_data.shape[0])\n", + "y_nonfeat_data = np.zeros(X_nonfeat_data.shape[0])\n", + "\n", + "# Concatnate all Classes for training \n", + "X = np.concatenate([X_feat_data, X_nonfeat_data])\n", + "y = np.concatenate([y_feat_data, y_nonfeat_data])\n", + "\n", + "# Split into Training and Testing Data.\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now that we have prepared the training and testing data, we will create an image array of the actual scene that we intend to classify. This array will serve as the input for our classification algorithms, allowing us to apply the trained classifiers to the entire scene and identify the forested and non-forested areas accurately." + ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "image_data = ds_class[bands].to_array(dim='band').transpose('latitude', 'longitude', 'band')\n", + "\n", + "# Reshape the image data\n", + "num_of_pixels = ds_class.sizes['longitude'] * ds_class.sizes['latitude']\n", + "num_of_bands = len(bands)\n", + "X_image_data = image_data.values.reshape(num_of_pixels, num_of_bands)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Classifiying with Naive Bayes\n", + "Now that we have prepared all the needed data, we can begin the actual classification process.\n", + "\n", + "We will start with a _Naive Bayes_ classifier. First, we will train the classifier using our training dataset. Once trained, we will apply the classifier to the actual image to identify the forested and non-forested areas.\n" + ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "# Naive Bayes initialization and training\n", + "nb = GaussianNB()\n", + "nb_test = nb.fit(X_train, y_train)\n", + "nb_predict = nb.predict(X_test)\n", + "\n", + "# Prediction on image\n", + "nb_predict_img = nb.predict(X_image_data)\n", + "nb_predict_img = nb_predict_img.reshape(ds_class.sizes['latitude'], ds_class.sizes['longitude'])\n", + "\n", + "# Adding the Naive Bayes Prediction to the dataset\n", + "ds_class['NB-forest'] = xr.DataArray(nb_predict_img, dims=['latitude', 'longitude'], coords={'longitude': ds_class['longitude'], 'latitude': ds_class['latitude']})" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To evaluate the effectiveness of the classification, we will plot the image predicted by the classifier. Additionally, we will examine the ``Classification Report`` and the ``Confusion Matrix`` to gain further insights into the classifier's performance." + ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "# Plot Naive Bayes\n", + "alpha = 1\t\n", + "cmap_green = colors.ListedColormap([(1, 1, 1, alpha), 'green'])\n", + "\n", + "plot = ds_class['NB-forest'].plot.imshow(cmap=cmap_green, cbar_kwargs={'ticks': [0.25,0.75]})\n", + "cbar = plot.colorbar\n", + "cbar.set_ticklabels(['non-forest', 'forest'])\n", + "plot.axes.set_title('Naive Bayes Classification')\n", + "plt.show()\n", + "\n", + "# Print the Classification report\n", + "print(\"NAIVE BAYES: \\n \"+ classification_report(y_test, nb_predict))\n", + "\n", + "# Print the confusion matrix\n", + "con_mat_nb = pd.DataFrame(confusion_matrix(y_test, nb_predict), \n", + " index=['Actual Negative', 'Actual Positive'], \n", + " columns=['Predicted Negative', 'Predicted Positive'])\n", + "display(con_mat_nb)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Classifiying with Random Forest\n", + "To ensure our results are robust, we will explore an additional classifier. In this section, we will use the Random Forest classifier. The procedure for using this classifier is the same as before: we will train the classifier using our training dataset and then apply it to the actual image to classify the scene." + ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "# Random Forest initialization and training\n", + "rf = RandomForestClassifier(n_estimators=100)\n", + "rf_test = rf.fit(X_train, y_train)\n", + "rf_predict = rf.predict(X_test)\n", + "\n", + "# Prediction on image\n", + "rf_predict_img = rf.predict(X_image_data)\n", + "rf_predict_img = rf_predict_img.reshape(ds_class.sizes['latitude'], ds_class.sizes['longitude'])\n", + "\n", + "# Adding the Random Forest Prediction to the dataset\n", + "ds_class['RF-forest'] = xr.DataArray(rf_predict_img, dims=['latitude', 'longitude'], coords={'longitude': ds_class['longitude'], 'latitude': ds_class['latitude']})\n", + "\n", + "plot = ds_class['RF-forest'].plot.imshow(cmap=cmap_green, cbar_kwargs={'ticks': [0.25,0.75]})\n", + "cbar = plot.colorbar\n", + "cbar.set_ticklabels(['non-forest', 'forest'])\n", + "plot.axes.set_title('Random Forest Classification')\n", + "plt.show()\n", + "\n", + "# Print the Classification report\n", + "print(\"RANDOM FOREST: \\n \"+ classification_report(y_test, rf_predict))\n", + "\n", + "# Print the confusion matrix\n", + "con_mat_rf = pd.DataFrame(confusion_matrix(y_test, rf_predict), \n", + " index=['Actual Negative', 'Actual Positive'], \n", + " columns=['Predicted Negative', 'Predicted Positive'])\n", + "display(con_mat_rf)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can already see from the `classification reports` and the `confusion matrices` that the Random Forest classifier has outperformed the Naive Bayes classifier. This is particularly evident from the lower values in the secondary diagonal, indicating minimal False Positives and False Negatives. It appears that the Naive Bayes classifier is more sensitive to False Positives, resulting in a higher rate of incorrect classifications.\n", + "\n", + "### Comparison of the Classificators\n", + "\n", + "To gain a more in-depth understanding of the classifiers' performance, we will compare their results. Specifically, we will identify the areas where both classifiers agree and the areas where they disagree. This comparison will provide valuable insights into the strengths and weaknesses of each classifier, allowing us to better assess their effectiveness in identifying forested and non-forested regions." + ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "#| code-fold: true\n", + "cmap_trio = colors.ListedColormap(['whitesmoke' ,'indianred', 'goldenrod', 'darkgreen'])\n", + "\n", + "\n", + "double_clf = (ds_class['NB-forest'] + 2*ds_class['RF-forest'])\n", + "\n", + "fig, ax = plt.subplots()\n", + "cax = ax.imshow(double_clf, cmap=cmap_trio, interpolation='none')\n", + "\n", + "# Add a colorbar with custom tick labels\n", + "cbar = fig.colorbar(cax, ticks=[1*0.375, 3*0.375, 5*0.375, 7*0.375])\n", + "cbar.ax.set_yticklabels(['None', 'Naive Bayes', 'Random Forest', 'Both'])\n", + "ax.set_title('Classification Comparisson')\n", + "ax.set_axis_off()\n", + "plt.show()" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The areas where both classifiers agree include the larger forested regions, such as the _Nationalpark Donau-Auen_ and the _Leithagebirge_. Additionally, both classifiers accurately identified the urban areas of Vienna and correctly excluded them from being classified as forested." + ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "#| code-fold: true\n", + "# Plot only one class, either None (0), Naive Bayes (1), Random Forest (2), or Both (3)\n", + "fig, axs = plt.subplots(2,2, figsize=(8,8))\n", + "ax = axs.ravel()\n", + "\n", + "for i in range(4):\n", + " ax[i].imshow(double_clf==i, cmap='cmc.oleron_r', interpolation='none')\n", + " category = ['by None', 'only by Naive Bayes', 'only by Random Forest', 'by Both'][i]\n", + " title = 'Areas classified ' + category\n", + " ax[i].set_title(title)\n", + " ax[i].set_axis_off()\n", + "\n", + "plt.tight_layout()" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "When plotting the classified areas individually, we observe that the Random Forest classifier mistakenly identified the Danube River as a forested area. Conversely, the Naive Bayes classifier erroneously classified a significant amount of cropland as forest.\n", + "\n", + "Finally, by analyzing the proportion of forested areas within the scene, we find that approximately 18% of the area is classified as forest, while around 66% is classified as non-forest. The remaining areas, which include water bodies and cropland, fall into less clearly defined categories.\n", + "\n", + "The accompanying bar chart illustrates the distribution of these classifications, highlighting the percentage of forested areas, non-forested areas, and regions classified by only one of the two classifiers. This visual representation helps to quantify the areas of agreement and disagreement between the classifiers, providing a clearer picture of their performance." + ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "#| code-fold: true\n", + "counts = {}\n", + "for num in range(0,4):\n", + " num_2_class = {0: 'None', 1: 'Naive Bayes', 2: 'Random Forest', 3: 'Both'}\n", + " counts[num_2_class[num]] = int((double_clf == num).sum().values)\n", + "\n", + "class_counts_df = pd.DataFrame(list(counts.items()), columns=['Class', 'Count'])\n", + "class_counts_df['Percentage'] = (class_counts_df['Count'] / class_counts_df['Count'].sum())*100\n", + "ax = class_counts_df.plot.bar(x='Class', y='Percentage', rot=0, color='darkgreen', ylim=(0,100), title='Classified Areas per Classificator (%)')\n", + "\n", + "# Annotate the bars with the percentage values\n", + "for p in ax.patches:\n", + " ax.annotate(f'{p.get_height():.1f}%', (p.get_x() + p.get_width() / 2., p.get_height()), \n", + " ha='center', va='center', xytext=(0, 9), textcoords='offset points')" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Conclusion\n", + "In this chapter, we utilized machine learning to classify satellite imagery into forested and non-forested areas, comparing Naive Bayes and Random Forest classifiers. The Random Forest classifier generally outperformed Naive Bayes, with fewer errors in classification, although it misclassified the Danube River as forested, while Naive Bayes incorrectly identified cropland as forest. The analysis, supported by the bar chart, revealed that about 18% of the scene was classified as forest, 66% as non-forest, and the remainder included ambiguous categories. This comparison highlights the strengths and limitations of each classifier, underscoring the need for careful selection and evaluation of classification methods." + ] + } + ], + "metadata": { + "kernelspec": { + "name": "01_classification", + "language": "python", + "display_name": "01_classification" + } }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from datetime import datetime, timedelta\n", - "\n", - "import xarray as xr\n", - "import pystac_client\n", - "import stackstac\n", - "import odc.stac\n", - "import rioxarray\n", - "import geopandas as gpd\n", - "from odc.geo.geobox import GeoBox\n", - "from dask.diagnostics import ProgressBar\n", - "from rasterio.enums import Resampling\n", - "from shapely.geometry import Polygon, mapping\n", - "\n", - "import cmcrameri as cmc\n", - "import numpy as np\n", - "import matplotlib.pyplot as plt\n", - "import seaborn as sns\n", - "import pandas as pd\n", - "from pathlib import Path\n", - "\n", - "# Scikit Learn\n", - "from sklearn.naive_bayes import GaussianNB\n", - "from sklearn.metrics import confusion_matrix\n", - "from sklearn.metrics import classification_report\n", - "from sklearn.ensemble import RandomForestClassifier\n", - "from sklearn.model_selection import train_test_split\n", - "\n", - "import matplotlib.colors as colors" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Before we start, we need to load the data. We will use ``odc-stac`` to obtain data from Earth Search by Element 84. Here we define the area of interest and the time frame, aswell as the EPSG code and the resolution.\n", - "\n", - "### Searching in the Catalog\n", - "The module ``odc-stac`` provides access to free, open source satelite data. To retrieve the data, we must define several parameters that specify the location and time period for the satellite data. Additionally, we must specify the data collection we wish to access, as multiple collections are available. In this example, we will use multispectral imagery from the Sentinel-2 satellite." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dx = 0.0006 # 60m resolution\n", - "epsg = 4326\n", - "\n", - "# Set Spatial extent\n", - "latmin, latmax = 47.86, 48.407\n", - "lonmin, lonmax = 16.32, 16.9\n", - "bounds = (lonmin, latmin, lonmax, latmax)\n", - "minx, miny, maxx, maxy = bounds\n", - "geom = {\n", - " 'type': 'Polygon',\n", - " 'coordinates': [[\n", - " [minx, miny],\n", - " [minx, maxy],\n", - " [maxx, maxy],\n", - " [maxx, miny],\n", - " [minx, miny]\n", - " ]]\n", - "}\n", - "\n", - "# Set Temporal extent\n", - "year = 2024\n", - "month = 5\n", - "day = 1\n", - "delta = 10\n", - "\n", - "start_date = datetime(year, month, day)\n", - "end_date = start_date + timedelta(days=delta)\n", - "date_query = start_date.strftime(\"%Y-%m-%d\") + \"/\" + end_date.strftime(\"%Y-%m-%d\")\n", - "\n", - "# Search for Sentinel-2 data\n", - "items = pystac_client.Client.open(\n", - " \"https://earth-search.aws.element84.com/v1\"\n", - ").search(\n", - " intersects=geom,\n", - " collections=[\"sentinel-2-l2a\"],\n", - " datetime=date_query,\n", - " limit=100,\n", - ").item_collection()\n", - "print(len(items), 'scenes found')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We will now focus on the area south-east of Vienna, where the Nationalpark _Donauauen_ is situated. The time frame we are interested in is the beginning of May 2024.\n", - "After passing these parameters to the `stac-catalog` we have found **10 scenes** that we can use for our analysis. \n", - "\n", - "### Loading the Data\n", - "Now we will load the data directly into an ``xarray`` dataset, which we can use to perform computations on the data. ``xarray`` is a powerful library for working with multi-dimensional arrays, making it well-suited for handling satellite data.\n", - "\n", - "Here's how we can load the data using odc-stac and xarray:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#| output: false\n", - "# define a geobox for my region\n", - "geobox = GeoBox.from_bbox(bounds, crs=f\"epsg:{epsg}\", resolution=dx)\n", - "\n", - "# lazily combine items\n", - "ds_odc = odc.stac.load(\n", - " items,\n", - " bands=[\"scl\", \"red\", \"green\", \"blue\", \"nir\"],\n", - " chunks={'time': 5, 'x': 600, 'y': 600},\n", - " geobox=geobox,\n", - " resampling=\"bilinear\",\n", - ")\n", - "\n", - "# actually load it\n", - "ds_odc.load()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Data Visualization\n", - "### RGB Image\n", - "With the image data now in our possession, we can proceed with computations and visualizations.\n", - "\n", - "First, we define a mask to exclude cloud cover and areas with missing data. Subsequently, we create a composite median image, where each pixel value represents the median value across all the scenes we have identified. This approach helps to eliminate clouds and outliers present in some of the images, thereby providing a clearer and more representative visualization of the scene." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# define a mask for valid pixels (non-cloud)\n", - "def is_valid_pixel(data):\n", - " # include only vegetated, not_vegitated, water, and snow\n", - " return ((data > 3) & (data < 7)) | (data==11)\n", - "\n", - "ds_odc['valid'] = is_valid_pixel(ds_odc.scl)\n", - "#ds_odc.valid.sum(\"time\").plot()\n", - "\n", - "def avg(ds):\n", - " return (ds / ds.max() * 2)\n", - "\n", - "# compute the masked median\n", - "rgb_median = (\n", - " ds_odc[['red', 'green', 'blue']]\n", - " .where(ds_odc.valid)\n", - " .to_dataarray(dim=\"band\")\n", - " .transpose(..., \"band\")\n", - " .median(dim=\"time\")\n", - " .astype(int)\n", - ")\n", - "rgb_comp = avg(rgb_median)\n", - "plot = rgb_comp.plot.imshow(rgb=\"band\", figsize=(8, 8))\n", - "plot.axes.set_title(f\"RGB - Median Composite\\n{start_date.strftime('%d.%m.%Y')} - {end_date.strftime('%d.%m.%Y')}\")\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### False Color Image\n", - "In addition to the regular RGB Image, we can swap any of the bands from the visible spectrum with any other bands. In this specific case the red band has been changed to the near infrared band. This allows us to see vegetated areas more clearly, since they now appear in a bright red color. This is due to the fact that plants absorb regular red light while reflecting near infrared light [@nasa2020]." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# compute the false color image\n", - "fc_median = (\n", - " ds_odc[['nir', 'green', 'blue']]\n", - " .where(ds_odc.valid)\n", - " .to_dataarray(dim=\"band\")\n", - " .transpose(..., \"band\")\n", - " .median(dim=\"time\")\n", - " .astype(int)\n", - ")\n", - "fc_comp = avg(fc_median)\n", - "plot = fc_comp.plot.imshow(rgb=\"band\", figsize=(8, 8))\n", - "plot.axes.set_title(f\"False Color - Median Composite\\n{start_date.strftime('%d.%m.%Y')} - {end_date.strftime('%d.%m.%Y')}\")\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### NDVI Image\n", - "To get an first impression of the data, we can calculate the NDVI (Normalized Difference Vegetation Index) and plot it. The NDVI is calculated by useing the following formula. [@rouse1974monitoring]\n", - "\n", - "$$\n", - "NDVI = \\frac{NIR - Red}{NIR + Red}\n", - "$$\n", - "\n", - "This gives us a good overview of the vegetation in the area. The values can range from -1 to 1 where the following meanings are associated with these values:\n", - "\n", - "- -1 to 0 indicate dead plants or inanimate objects\n", - "- 0 to 0.33 are unhealthy plants\n", - "- 0.33 to 0.66 are moderatly healthy plants\n", - "- 0.66 to 1 are very healthy plants" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Normalized Difference Vegetation Index (NDVI)\n", - "def normalized_difference(a, b):\n", - " return (a - b*1.) / (a + b)\n", - "\n", - "ndvi = normalized_difference(ds_odc.nir, ds_odc.red)\n", - "ndvi.median(dim=\"time\").plot.imshow(cmap='cmc.cork', vmin=-1, vmax=1).axes.set_title('NDVI')\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Classification \n", - "In this chapter, we will classify the satellite data to identify forested areas within the scene. By using supervised machine learning techniques, we can train classifiers to distinguish between forested and non-forested regions based on the training data we provide. We will explore two different classifiers and compare their performance in accurately identifying forest areas.\n", - "\n", - "### Regions of Interest\n", - "Since this is a supervised classification, we need to have some training data. Therefore we need to define areas or regions, which we are certain represent the feature which we are classifiying. In this case we are interested in forested areas and regions that are definitly not forested. These regions will be used to train our classifiers." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Define Polygons\n", - "forest_areas = {\n", - " 0: [Polygon([(16.482772, 47.901753), (16.465133, 47.870124), (16.510142, 47.874382), (16.482772, 47.901753)])],\n", - " 1: [Polygon([(16.594079, 47.938855), (16.581914, 47.894454), (16.620233, 47.910268), (16.594079, 47.938855)])],\n", - " 2: [Polygon([(16.67984, 47.978998), (16.637263, 47.971091), (16.660376, 47.929123), (16.67984, 47.978998)])],\n", - " 3: [Polygon([(16.756477, 48.000286), (16.723024, 47.983256), (16.739446, 47.972916), (16.756477, 48.000286)])],\n", - " 4: [Polygon([(16.80696, 48.135923), (16.780806, 48.125583), (16.798445, 48.115243), (16.80696, 48.135923)])],\n", - " 5: [Polygon([(16.684097, 48.144438), (16.664634, 48.124366), (16.690788, 48.118892), (16.684097, 48.144438)])],\n", - " 6: [Polygon([(16.550894, 48.169984), (16.530822, 48.165118), (16.558801, 48.137139), (16.550894, 48.169984)])],\n", - " 7: [Polygon([(16.588604, 48.402329), (16.556976, 48.401112), (16.580697, 48.382865), (16.588604, 48.402329)])],\n", - "}\n", - "\n", - "nonforest_areas = {\n", - " 0: [Polygon([(16.674974, 48.269126), (16.623882, 48.236281), (16.682272, 48.213168), (16.674974, 48.269126)])],\n", - " 1: [Polygon([(16.375723, 48.228374), (16.357476, 48.188839), (16.399444, 48.185798), (16.375723, 48.228374)])],\n", - " 2: [Polygon([(16.457834, 48.26426), (16.418907, 48.267301), (16.440804, 48.23324), (16.457834, 48.26426)])],\n", - " 3: [Polygon([(16.519266, 48.101861), (16.470607, 48.100645), (16.500411, 48.07145), (16.519266, 48.101861)])],\n", - " 4: [Polygon([(16.453577, 48.051986), (16.412217, 48.067192), (16.425598, 48.012451), (16.453577, 48.051986)])],\n", - "}\n", - "\n", - "# Geoppandas Dataframe from Polygons\n", - "forest_df = gpd.GeoDataFrame({'geometry': [poly[0] for poly in forest_areas.values()]}, crs=\"EPSG:4326\")\n", - "nonforest_df = gpd.GeoDataFrame({'geometry': [poly[0] for poly in nonforest_areas.values()]}, crs=\"EPSG:4326\")\n", - "\n", - "\n", - "# Plotting Regions of Interest\n", - "fig, ax = plt.subplots()\n", - "rgb_comp.plot.imshow(ax=ax)\n", - "forest_df.plot(ax=ax, ec='C0', fc='none')\n", - "nonforest_df.plot(ax=ax, ec='C1', fc='none')\n", - "ax.set_title('Regions of Interest')\n", - "ax.set_aspect('equal')\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Data Preparation\n", - "In addition to the Regions of Interest we will extract the specific bands from the loaded dataset that we intend to use for the classification, which are the `red, green, blue` and `near-infrared` bands, although other bands can also be utilized. Using these bands, we will create both a training and a testing dataset. The training dataset will be used to train the classifier, while the testing dataset will be employed to evaluate its performance." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Classifiying dataset (only necessary bands)\n", - "bands = ['red', 'green', 'blue', 'nir']\n", - "ds_class = (\n", - " ds_odc[bands]\n", - " .where(ds_odc.valid)\n", - " .median(dim=\"time\")\n", - ")\n", - "ds_class = avg(ds_class)\n", - "ds_class = ds_class.fillna(0)\n", - "\n", - "def clip_array(ds:xr.Dataset, polygons):\n", - " clipped = ds.rio.clip(polygons, invert=False, all_touched=False, drop=True)\n", - " clipped_nan = clipped.where(clipped == ds)\n", - " return clipped_nan\n", - "\n", - "# Dictionaries with Dataarrays, each clipped by a Polygon\n", - "data_dict_feat = {idx: clip_array(ds_class, polygon) for idx, polygon in forest_areas.items()}\n", - "data_dict_nonfeat = {idx: clip_array(ds_class, polygon) for idx, polygon in nonforest_areas.items()}\n", - "\n", - "# Reshape the polygon dataarrays to get a tuple (one value per band) of pixel values\n", - "feat_data = [xarray.to_array().values.reshape(len(bands),-1).T for xarray in data_dict_feat.values()] # replaced median_data_dict_feat with data_dict_feat\n", - "nonfeat_data = [xarray.to_array().values.reshape(len(bands),-1).T for xarray in data_dict_nonfeat.values()] # replaced median_data_dict_feat with data_dict_feat\n", - "\n", - "# The rows of the different polygons are concatenated to a single array for further processing\n", - "feat_values = np.concatenate(feat_data)\n", - "nonfeat_values = np.concatenate(nonfeat_data)\n", - "\n", - "# Drop Nan Values\n", - "X_feat_data = feat_values[~np.isnan(feat_values).any(axis=1)]\n", - "X_nonfeat_data = nonfeat_values[~np.isnan(nonfeat_values).any(axis=1)]\n", - "\n", - "# Creating Output Vector (1 for pixel is features; 0 for pixel is not feature)\n", - "y_feat_data = np.ones(X_feat_data.shape[0])\n", - "y_nonfeat_data = np.zeros(X_nonfeat_data.shape[0])\n", - "\n", - "# Concatnate all Classes for training \n", - "X = np.concatenate([X_feat_data, X_nonfeat_data])\n", - "y = np.concatenate([y_feat_data, y_nonfeat_data])\n", - "\n", - "# Split into Training and Testing Data.\n", - "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now that we have prepared the training and testing data, we will create an image array of the actual scene that we intend to classify. This array will serve as the input for our classification algorithms, allowing us to apply the trained classifiers to the entire scene and identify the forested and non-forested areas accurately." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "image_data = ds_class[bands].to_array(dim='band').transpose('latitude', 'longitude', 'band')\n", - "\n", - "# Reshape the image data\n", - "num_of_pixels = ds_class.sizes['longitude'] * ds_class.sizes['latitude']\n", - "num_of_bands = len(bands)\n", - "X_image_data = image_data.values.reshape(num_of_pixels, num_of_bands)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Classifiying with Naive Bayes\n", - "Now that we have prepared all the needed data, we can begin the actual classification process.\n", - "\n", - "We will start with a _Naive Bayes_ classifier. First, we will train the classifier using our training dataset. Once trained, we will apply the classifier to the actual image to identify the forested and non-forested areas.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Naive Bayes initialization and training\n", - "nb = GaussianNB()\n", - "nb_test = nb.fit(X_train, y_train)\n", - "nb_predict = nb.predict(X_test)\n", - "\n", - "# Prediction on image\n", - "nb_predict_img = nb.predict(X_image_data)\n", - "nb_predict_img = nb_predict_img.reshape(ds_class.sizes['latitude'], ds_class.sizes['longitude'])\n", - "\n", - "# Adding the Naive Bayes Prediction to the dataset\n", - "ds_class['NB-forest'] = xr.DataArray(nb_predict_img, dims=['latitude', 'longitude'], coords={'longitude': ds_class['longitude'], 'latitude': ds_class['latitude']})" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To evaluate the effectiveness of the classification, we will plot the image predicted by the classifier. Additionally, we will examine the ``Classification Report`` and the ``Confusion Matrix`` to gain further insights into the classifier's performance." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Plot Naive Bayes\n", - "alpha = 1\t\n", - "cmap_green = colors.ListedColormap([(1, 1, 1, alpha), 'green'])\n", - "\n", - "plot = ds_class['NB-forest'].plot.imshow(cmap=cmap_green, cbar_kwargs={'ticks': [0.25,0.75]})\n", - "cbar = plot.colorbar\n", - "cbar.set_ticklabels(['non-forest', 'forest'])\n", - "plot.axes.set_title('Naive Bayes Classification')\n", - "plt.show()\n", - "\n", - "# Print the Classification report\n", - "print(\"NAIVE BAYES: \\n \"+ classification_report(y_test, nb_predict))\n", - "\n", - "# Print the confusion matrix\n", - "con_mat_nb = pd.DataFrame(confusion_matrix(y_test, nb_predict), \n", - " index=['Actual Negative', 'Actual Positive'], \n", - " columns=['Predicted Negative', 'Predicted Positive'])\n", - "display(con_mat_nb)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Classifiying with Random Forest\n", - "To ensure our results are robust, we will explore an additional classifier. In this section, we will use the Random Forest classifier. The procedure for using this classifier is the same as before: we will train the classifier using our training dataset and then apply it to the actual image to classify the scene." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Random Forest initialization and training\n", - "rf = RandomForestClassifier(n_estimators=100)\n", - "rf_test = rf.fit(X_train, y_train)\n", - "rf_predict = rf.predict(X_test)\n", - "\n", - "# Prediction on image\n", - "rf_predict_img = rf.predict(X_image_data)\n", - "rf_predict_img = rf_predict_img.reshape(ds_class.sizes['latitude'], ds_class.sizes['longitude'])\n", - "\n", - "# Adding the Random Forest Prediction to the dataset\n", - "ds_class['RF-forest'] = xr.DataArray(rf_predict_img, dims=['latitude', 'longitude'], coords={'longitude': ds_class['longitude'], 'latitude': ds_class['latitude']})\n", - "\n", - "plot = ds_class['RF-forest'].plot.imshow(cmap=cmap_green, cbar_kwargs={'ticks': [0.25,0.75]})\n", - "cbar = plot.colorbar\n", - "cbar.set_ticklabels(['non-forest', 'forest'])\n", - "plot.axes.set_title('Random Forest Classification')\n", - "plt.show()\n", - "\n", - "# Print the Classification report\n", - "print(\"RANDOM FOREST: \\n \"+ classification_report(y_test, rf_predict))\n", - "\n", - "# Print the confusion matrix\n", - "con_mat_rf = pd.DataFrame(confusion_matrix(y_test, rf_predict), \n", - " index=['Actual Negative', 'Actual Positive'], \n", - " columns=['Predicted Negative', 'Predicted Positive'])\n", - "display(con_mat_rf)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can already see from the `classification reports` and the `confusion matrices` that the Random Forest classifier has outperformed the Naive Bayes classifier. This is particularly evident from the lower values in the secondary diagonal, indicating minimal False Positives and False Negatives. It appears that the Naive Bayes classifier is more sensitive to False Positives, resulting in a higher rate of incorrect classifications.\n", - "\n", - "### Comparison of the Classificators\n", - "\n", - "To gain a more in-depth understanding of the classifiers' performance, we will compare their results. Specifically, we will identify the areas where both classifiers agree and the areas where they disagree. This comparison will provide valuable insights into the strengths and weaknesses of each classifier, allowing us to better assess their effectiveness in identifying forested and non-forested regions." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#| code-fold: true\n", - "cmap_trio = colors.ListedColormap(['whitesmoke' ,'indianred', 'goldenrod', 'darkgreen'])\n", - "\n", - "\n", - "double_clf = (ds_class['NB-forest'] + 2*ds_class['RF-forest'])\n", - "\n", - "fig, ax = plt.subplots()\n", - "cax = ax.imshow(double_clf, cmap=cmap_trio, interpolation='none')\n", - "\n", - "# Add a colorbar with custom tick labels\n", - "cbar = fig.colorbar(cax, ticks=[1*0.375, 3*0.375, 5*0.375, 7*0.375])\n", - "cbar.ax.set_yticklabels(['None', 'Naive Bayes', 'Random Forest', 'Both'])\n", - "ax.set_title('Classification Comparisson')\n", - "ax.set_axis_off()\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The areas where both classifiers agree include the larger forested regions, such as the _Nationalpark Donau-Auen_ and the _Leithagebirge_. Additionally, both classifiers accurately identified the urban areas of Vienna and correctly excluded them from being classified as forested." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#| code-fold: true\n", - "# Plot only one class, either None (0), Naive Bayes (1), Random Forest (2), or Both (3)\n", - "fig, axs = plt.subplots(2,2, figsize=(8,8))\n", - "ax = axs.ravel()\n", - "\n", - "for i in range(4):\n", - " ax[i].imshow(double_clf==i, cmap='cmc.oleron_r', interpolation='none')\n", - " category = ['by None', 'only by Naive Bayes', 'only by Random Forest', 'by Both'][i]\n", - " title = 'Areas classified ' + category\n", - " ax[i].set_title(title)\n", - " ax[i].set_axis_off()\n", - "\n", - "plt.tight_layout()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "When plotting the classified areas individually, we observe that the Random Forest classifier mistakenly identified the Danube River as a forested area. Conversely, the Naive Bayes classifier erroneously classified a significant amount of cropland as forest.\n", - "\n", - "Finally, by analyzing the proportion of forested areas within the scene, we find that approximately 18% of the area is classified as forest, while around 66% is classified as non-forest. The remaining areas, which include water bodies and cropland, fall into less clearly defined categories.\n", - "\n", - "The accompanying bar chart illustrates the distribution of these classifications, highlighting the percentage of forested areas, non-forested areas, and regions classified by only one of the two classifiers. This visual representation helps to quantify the areas of agreement and disagreement between the classifiers, providing a clearer picture of their performance." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#| code-fold: true\n", - "counts = {}\n", - "for num in range(0,4):\n", - " num_2_class = {0: 'None', 1: 'Naive Bayes', 2: 'Random Forest', 3: 'Both'}\n", - " counts[num_2_class[num]] = int((double_clf == num).sum().values)\n", - "\n", - "class_counts_df = pd.DataFrame(list(counts.items()), columns=['Class', 'Count'])\n", - "class_counts_df['Percentage'] = (class_counts_df['Count'] / class_counts_df['Count'].sum())*100\n", - "ax = class_counts_df.plot.bar(x='Class', y='Percentage', rot=0, color='darkgreen', ylim=(0,100), title='Classified Areas per Classificator (%)')\n", - "\n", - "# Annotate the bars with the percentage values\n", - "for p in ax.patches:\n", - " ax.annotate(f'{p.get_height():.1f}%', (p.get_x() + p.get_width() / 2., p.get_height()), \n", - " ha='center', va='center', xytext=(0, 9), textcoords='offset points')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Conclusion\n", - "In this chapter, we utilized machine learning to classify satellite imagery into forested and non-forested areas, comparing Naive Bayes and Random Forest classifiers. The Random Forest classifier generally outperformed Naive Bayes, with fewer errors in classification, although it misclassified the Danube River as forested, while Naive Bayes incorrectly identified cropland as forest. The analysis, supported by the bar chart, revealed that about 18% of the scene was classified as forest, 66% as non-forest, and the remainder included ambiguous categories. This comparison highlights the strengths and limitations of each classifier, underscoring the need for careful selection and evaluation of classification methods." - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "01_classification", - "language": "python", - "name": "01_classification" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} + "nbformat": 4, + "nbformat_minor": 4 +} \ No newline at end of file diff --git a/notebooks/02_floodmapping.ipynb b/notebooks/02_floodmapping.ipynb index 5fe3a33..6b99535 100644 --- a/notebooks/02_floodmapping.ipynb +++ b/notebooks/02_floodmapping.ipynb @@ -1,341 +1,354 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Reverend Bayes updates our Belief in Flood Detection\n", - "How an 275 year old idea helps map the extent of floods\n" - ] + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "title: Reverend Bayes updates our Belief in Flood Detection\n", + "subtitle: How an 275 year old idea helps map the extent of floods\n", + "jupyter: \n", + " kernelspec:\n", + " name: \"02_floodmapping\"\n", + " language: \"python\"\n", + " display_name: \"02_floodmapping\"\n", + "---\n", + "\n", + "![Image from [wikipedia](https://upload.wikimedia.org/wikipedia/commons/d/d4/Thomas_Bayes.gif)](https://upload.wikimedia.org/wikipedia/commons/d/d4/Thomas_Bayes.gif)\n", + "\n", + "::: {.callout-note}\n", + "This notebooks contains interactive element. These interactive element can only be viewed on Binder by clicking on the Binder badge or 🚀 button.\n", + ":::\n" + ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "%matplotlib widget\n", + "\n", + "import numpy as np\n", + "import xarray as xr\n", + "import numpy as np\n", + "import datetime\n", + "import matplotlib.pyplot as plt\n", + "\n", + "from scipy.stats import norm\n", + "from eomaps import Maps" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "sig0_dc = xr.open_dataset('../data/s1_parameters/S1_CSAR_IWGRDH/SIG0/V1M1R1/EQUI7_EU020M/E054N006T3/SIG0_20180228T043908__VV_D080_E054N006T3_EU020M_V1M1R1_S1AIWGRDH_TUWIEN.nc')" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## From Backscattering to Flood Mapping\n", + "\n", + "This notebook explains how microwave ($\\sigma^0$) backscattering (@fig-area) can be used to map the extent of a flood. We replicate in this exercise the work of @bauer-marschallinger_satellite-based_2022 on the TU Wien Bayesian-based flood mapping algorithm.\n", + "\n", + "In the following lines we create a map with EOmaps [@quast_getting_2024] of the $\\sigma^0$ backscattering values." + ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "#| label: fig-area\n", + "#| fig-cap: 'Area targeted for $\\sigma^0$ backscattering is the Greek region of Thessaly, which experienced a major flood in February of 2018.'\n", + "m = Maps(ax=121, crs=3857)\n", + "m.set_data(data=sig0_dc, x=\"x\", y=\"y\", parameter=\"SIG0\", crs=Maps.CRS.Equi7_EU)\n", + "m.plot_map()\n", + "m.add_colorbar(label=\"$\\sigma^0$ (dB)\", orientation=\"vertical\", hist_bins=30)\n", + "m.add_scalebar(n=5)\n", + "m2 = m.new_map(ax=122, crs=3857)\n", + "m2.set_extent(m.get_extent())\n", + "m2.add_wms.OpenStreetMap.add_layer.default()\n", + "m.apply_layout(\n", + " {\n", + " 'figsize': [7.32, 4.59],\n", + " '0_map': [0.05, 0.18, 0.35, 0.64],\n", + " '1_cb': [0.8125, 0.1, 0.1, 0.8],\n", + " '1_cb_histogram_size': 0.8,\n", + " '2_map': [0.4375, 0.18, 0.35, 0.64]\n", + " }\n", + " )\n", + "m.show()" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Microwave Backscattering over Land and Water\n", + "\n", + "Reverend Bayes was concerned with two events, one (the *hypothesis*) occurring before the other (the *evidence*). If we know its cause, it is easy to logically deduce the probability of an effect. However, in this case we want to deduce the probability of a cause from an observed effect, also known as \"reversed probability\". In the case of flood mapping, we have $\\sigma^0$ backscatter observations over land (the effect) and we want to deduce the probability of flooding ($F$) and non-flooding ($NF$). \n", + "\n", + "In other words, we want to know the probability of flooding $P(F)$ given a pixel's $\\sigma^0$:\n", + "\n", + "$$P(F|\\sigma^0)$$\n", + "\n", + "and the probability of a pixel being not flooded $P(NF)$ given a certain $\\sigma^0$:\n", + "\n", + "$$P(NF|\\sigma^0).$$\n", + "\n", + "Bayes showed that these can be deduced from the observation that forward and reversed probability are equal, so that:\n", + "\n", + "$$P(F|\\sigma^0)P(\\sigma^0) = P(\\sigma^0|F)P(F)$$\n", + "\n", + "and\n", + "\n", + "$$P(NF|\\sigma^0)P(\\sigma^0) = P(\\sigma^0|NF)P(NF).$$\n", + "\n", + "\n", + "The forward probability of $\\sigma^0$ given the occurrence of flooding ($P(\\sigma^0|F)$) and $\\sigma^0$ given no flooding ($P(\\sigma^0|NF)$) can be extracted from past information on backscattering over land and water surfaces. As seen in the sketch below (@fig-sat), the characteristics of backscattering over land and water differ considerably.\n", + "\n", + "![Schematic backscattering over land and water. Image from [Geological Survey Ireland](https://www.gsi.ie/images/images/SAR_mapping_land_water.jpg)](https://www.gsi.ie/images/images/SAR_mapping_land_water.jpg){#fig-sat}\n", + "\n", + "## Likelihoods\n", + "\n", + "The so-called likelihoods of $P(\\sigma^0|F)$ and $P(\\sigma^0|NF)$ can thus be calculated from past backscattering information. Without going into the details of how these likelihoods are calculated, you can **click** on a pixel of the map to plot the likelihoods of $\\sigma^0$ being governed by land or water." + ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "#| label: fig-lik\n", + "#| fig-cap: 'Likelihoods for $\\sigma^0$ being associated with land or water for 1 pixel in the Greek area of Thessaly. Likelihoods are calculated over a range of $\\sigma^0$. The pixel''s observed $\\sigma^0$ is given with a vertical line. Click on the map to re-calculate and update this figure for another pixel in the study area. Map created with EOmaps [@quast_getting_2024].'\n", + "\n", + "RANGE = np.arange(-30, 0, 0.1)\n", + "hparam_dc = xr.open_dataset('../data/tuw_s1_harpar/S1_CSAR_IWGRDH/SIG0-HPAR/V0M2R3/EQUI7_EU020M/E054N006T3/D080.nc')\n", + "plia_dc = xr.open_dataset('../data/s1_parameters/S1_CSAR_IWGRDH/PLIA-TAG/V01R03/EQUI7_EU020M/E054N006T3/PLIA-TAG-MEAN_20200101T000000_20201231T235959__D080_E054N006T3_EU020M_V01R03_S1IWGRDH.nc')\n", + "sig0_dc['id'] = (('y', 'x'), np.arange(sig0_dc.SIG0.size).reshape(sig0_dc.SIG0.shape))\n", + "hparam_dc['id'] = (('y', 'x'), np.arange(sig0_dc.SIG0.size).reshape(sig0_dc.SIG0.shape))\n", + "plia_dc['id'] = (('y', 'x'), np.arange(sig0_dc.SIG0.size).reshape(sig0_dc.SIG0.shape))\n", + "\n", + "def calc_water_likelihood(id, x):\n", + " point = plia_dc.where(plia_dc.id == id, drop=True)\n", + " wbsc_mean = point.PLIA * -0.394181 + -4.142015\n", + " wbsc_std = 2.754041\n", + " return norm.pdf(x, wbsc_mean.to_numpy(), wbsc_std).flatten()\n", + "\n", + "def expected_land_backscatter(data, dtime_str):\n", + " w = np.pi * 2 / 365\n", + " dt = datetime.datetime.strptime(dtime_str, \"%Y-%m-%d\")\n", + " t = dt.timetuple().tm_yday\n", + " wt = w * t\n", + "\n", + " M0 = data.M0\n", + " S1 = data.S1\n", + " S2 = data.S2\n", + " S3 = data.S3\n", + " C1 = data.C1\n", + " C2 = data.C2\n", + " C3 = data.C3\n", + " hm_c1 = (M0 + S1 * np.sin(wt)) + (C1 * np.cos(wt))\n", + " hm_c2 = ((hm_c1 + S2 * np.sin(2 * wt)) + C2 * np.cos(2 * wt))\n", + " hm_c3 = ((hm_c2 + S3 * np.sin(3 * wt)) + C3 * np.cos(3 * wt))\n", + " return hm_c3\n", + "\n", + "def calc_land_likelihood(id, x):\n", + " point = hparam_dc.where(hparam_dc.id == id, drop=True)\n", + " lbsc_mean = expected_land_backscatter(point, '2018-02-01')\n", + " lbsc_std = point.STD\n", + " return norm.pdf(x, lbsc_mean.to_numpy(), lbsc_std.to_numpy()).flatten()\n", + "\n", + "def calc_likelihoods(id, x):\n", + " if isinstance(x, list):\n", + " x = np.arange(x[0], x[1], 0.1)\n", + " water_likelihood, land_likelihood = calc_water_likelihood(id=id, x=x), calc_land_likelihood(id=id, x=x)\n", + " return water_likelihood, land_likelihood\n", + "\n", + "def view_bayes_flood(sig0_dc, calc_posteriors=None, bayesian_flood_decision=None):\n", + "\n", + " # initialize a map on top\n", + " m = Maps(ax=122, layer=\"data\", crs=Maps.CRS.Equi7_EU)\n", + "\n", + " # initialize 2 matplotlib plot-axes next to the map\n", + " ax_upper = m.f.add_subplot(221)\n", + " ax_upper.set_ylabel(\"likelihood\")\n", + " ax_upper.set_xlabel(\"$\\sigma^0 (dB)$\")\n", + "\n", + " ax_lower = m.f.add_subplot(223)\n", + " ax_lower.set_ylabel(\"probability\")\n", + " ax_lower.set_xlabel(\"$\\sigma^0 (dB)$\")\n", + "\n", + " # -------- assign data to the map and plot it\n", + " if bayesian_flood_decision is not None:\n", + " # add map\n", + " m2 = m.new_layer(layer=\"map\")\n", + " m2.add_wms.OpenStreetMap.add_layer.default()\n", + " flood_classification = bayesian_flood_decision(sig0_dc.id, sig0_dc.SIG0)\n", + " sig0_dc[\"decision\"] = (('y', 'x'), flood_classification.reshape(sig0_dc.SIG0.shape))\n", + " sig0_dc[\"decision\"] = sig0_dc.decision.where(sig0_dc.SIG0.notnull())\n", + " sig0_dc[\"decision\"] = sig0_dc.decision.where(sig0_dc.decision==0)\n", + " m.set_data(data=sig0_dc, x=\"x\", y=\"y\", parameter=\"decision\", crs=Maps.CRS.Equi7_EU)\n", + " m.plot_map()\n", + " m.show_layer(\"map\", (\"data\", 0.5))\n", + " m.apply_layout(\n", + " {\n", + " 'figsize': [7.32, 4.59],\n", + " '0_map': [0.44573, 0.11961, 0.3375, 0.75237],\n", + " '1_': [0.10625, 0.5781, 0.3125, 0.29902],\n", + " '2_': [0.10625, 0.11961, 0.3125, 0.29902],\n", + " }\n", + " )\n", + "\n", + " else:\n", + " m.set_data(data=sig0_dc, x=\"x\", y=\"y\", parameter=\"SIG0\", crs=Maps.CRS.Equi7_EU)\n", + " m.plot_map()\n", + " m.add_colorbar(label=\"$\\sigma^0$ (dB)\", orientation=\"vertical\", hist_bins=30)\n", + " m.apply_layout(\n", + " {\n", + " 'figsize': [7.32, 4.59],\n", + " '0_map': [0.44573, 0.11961, 0.3375, 0.75237],\n", + " '1_': [0.10625, 0.5781, 0.3125, 0.29902],\n", + " '2_': [0.10625, 0.11961, 0.3125, 0.29902],\n", + " '3_cb': [0.8, 0.09034, 0.1, 0.85],\n", + " '3_cb_histogram_size': 0.8\n", + " }\n", + " )\n", + "\n", + " def update_plots(ID, **kwargs):\n", + " \n", + " # get the data\n", + " value = sig0_dc.where(sig0_dc.id == ID, drop=True).SIG0.to_numpy()\n", + " y1_pdf, y2_pdf = calc_water_likelihood(ID, RANGE), calc_land_likelihood(ID, RANGE)\n", + "\n", + " # plot the lines and vline\n", + " (water,) = ax_upper.plot(RANGE, y1_pdf, 'k-', lw=2, label=\"water\")\n", + " (land,) = ax_upper.plot(RANGE, y2_pdf,'r-', lw=5, alpha=0.6, label=\"land\")\n", + " value_left = ax_upper.vlines(x=value, ymin=0, ymax=np.max((y1_pdf, y2_pdf)), lw=3, label=\"observed\")\n", + " ax_upper.legend(loc=\"upper left\")\n", + "\n", + " # add all artists as \"temporary pick artists\" so that they\n", + " # are removed when the next datapoint is selected\n", + " for a in [water, land, value_left]:\n", + " m.cb.pick.add_temporary_artist(a)\n", + "\n", + " if calc_posteriors is not None:\n", + " f_post, nf_post = calc_posteriors(y1_pdf, y2_pdf)\n", + " (f,) = ax_lower.plot(RANGE, f_post, 'k-', lw=2, label=\"flood\")\n", + " (nf,) = ax_lower.plot(RANGE, nf_post,'r-', lw=5, alpha=0.6, label=\"non-flood\")\n", + " value_right = ax_lower.vlines(x=value, ymin=-0.1, ymax=1.1, lw=3, label=\"observed\")\n", + " ax_lower.legend(loc=\"upper left\")\n", + " for a in [f, nf, value_right]:\n", + " m.cb.pick.add_temporary_artist(a)\n", + "\n", + " # re-compute axis limits based on the new artists\n", + " ax_upper.relim()\n", + " ax_upper.autoscale()\n", + "\n", + " m.cb.pick.attach(update_plots)\n", + " m.cb.pick.attach.mark(permanent=False, buffer=1, fc=\"none\", ec=\"r\")\n", + " m.cb.pick.attach.mark(permanent=False, buffer=2, fc=\"none\", ec=\"r\", ls=\":\")\n", + " m.show()\n", + "\n", + "view_bayes_flood(sig0_dc)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Posteriors\n", + "\n", + "Having calculated the likelihoods, we can now move on to calculate the probability of (non-)flooding given a pixel's $\\sigma^0$. These so-called *posteriors* need one more piece of information, as can be seen in the equation above. We need the probability that a pixel is flooded $P(F)$ or not flooded $P(NF)$. Of course, these are the figures we've been trying to find this whole time. We don't actually have them yet, so what can we do? In Bayesian statistics, we can just start with our best guess. These guesses are called our \"priors\", because they are the beliefs we hold *prior* to looking at the data. This subjective prior belief is the foundation Bayesian statistics, and we use the likelihoods we just calculated to update our belief in this particular hypothesis. This updated belief is called the \"posterior\".\n", + "\n", + "Let's say that our best estimate for the chance of flooding versus non-flooding of a pixel is 50-50: a coin flip. We now can also calculate the probability of backscattering $P(\\sigma^0)$, as the weighted average of the water and land likelihoods, ensuring that our posteriors range between 0 to 1.\n", + "\n", + "The following code block shows how we calculate the priors." + ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "def calc_posteriors(water_likelihood, land_likelihood):\n", + " evidence = (water_likelihood * 0.5) + (land_likelihood * 0.5)\n", + " return (water_likelihood * 0.5) / evidence, (land_likelihood * 0.5) / evidence" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can plot the posterior probabilities of flooding and non-flooding again and compare these to pixel's measured $\\sigma^0$. **Click** on a pixel to calculate the posterior probability." + ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "#| label: fig-post\n", + "#| fig-cap: 'Posterior probabilities for $\\sigma^0$ of 1 pixel being associated with land for water in the Greek area of Thessaly. Click on the map to re-calculate and update this figure for another pixel in the study area. Map created with EOmaps [@quast_getting_2024].'\n", + "view_bayes_flood(sig0_dc, calc_posteriors)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Flood Classification\n", + "\n", + "We are now ready to combine all this information and classify the pixels according to the probability of flooding given the backscatter value of each pixel. Here we just look whether the probability of flooding is higher than non-flooding:" + ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "def bayesian_flood_decision(id, sig0_dc):\n", + " nf_post_prob, f_post_prob = calc_posteriors(*calc_likelihoods(id, sig0_dc))\n", + " return np.greater(f_post_prob, nf_post_prob)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Click** on a point in the below map to see the likelihoods and posterior distributions (in the left-hand subplots)." + ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "#| label: fig-clas\n", + "#| fig-cap: 'Flood extent of the Greek region of Thessaly based on Bayesian probabilities are shown on the map superimposed on an open street map. Click on a pixel to generate the point''s water and land likelihoods as well as the posterior probabilities. Map created with EOmaps [@quast_getting_2024].'\n", + "view_bayes_flood(sig0_dc, calc_posteriors, bayesian_flood_decision)" + ], + "execution_count": null, + "outputs": [] + } + ], + "metadata": { + "kernelspec": { + "name": "02_floodmapping", + "language": "python", + "display_name": "02_floodmapping" + } }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%matplotlib widget\n", - "\n", - "import numpy as np\n", - "import xarray as xr\n", - "import numpy as np\n", - "import datetime\n", - "import matplotlib.pyplot as plt\n", - "\n", - "from scipy.stats import norm\n", - "from eomaps import Maps" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sig0_dc = xr.open_dataset('../data/s1_parameters/S1_CSAR_IWGRDH/SIG0/V1M1R1/EQUI7_EU020M/E054N006T3/SIG0_20180228T043908__VV_D080_E054N006T3_EU020M_V1M1R1_S1AIWGRDH_TUWIEN.nc')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## From Backscattering to Flood Mapping\n", - "\n", - "This notebook explains how microwave ($\\sigma^0$) backscattering (@fig-area) can be used to map the extent of a flood. We replicate in this exercise the work of @bauer-marschallinger_satellite-based_2022 on the TU Wien Bayesian-based flood mapping algorithm.\n", - "\n", - "In the following lines we create a map with EOmaps [@quast_getting_2024] of the $\\sigma^0$ backscattering values." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#| label: fig-area\n", - "#| fig-cap: 'Area targeted for $\\sigma^0$ backscattering is the Greek region of Thessaly, which experienced a major flood in February of 2018.'\n", - "m = Maps(ax=121, crs=3857)\n", - "m.set_data(data=sig0_dc, x=\"x\", y=\"y\", parameter=\"SIG0\", crs=Maps.CRS.Equi7_EU)\n", - "m.plot_map()\n", - "m.add_colorbar(label=\"$\\sigma^0$ (dB)\", orientation=\"vertical\", hist_bins=30)\n", - "m.add_scalebar(n=5)\n", - "m2 = m.new_map(ax=122, crs=3857)\n", - "m2.set_extent(m.get_extent())\n", - "m2.add_wms.OpenStreetMap.add_layer.default()\n", - "m.apply_layout(\n", - " {\n", - " 'figsize': [7.32, 4.59],\n", - " '0_map': [0.05, 0.18, 0.35, 0.64],\n", - " '1_cb': [0.8125, 0.1, 0.1, 0.8],\n", - " '1_cb_histogram_size': 0.8,\n", - " '2_map': [0.4375, 0.18, 0.35, 0.64]\n", - " }\n", - " )\n", - "m.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Microwave Backscattering over Land and Water\n", - "\n", - "Reverend Bayes was concerned with two events, one (the *hypothesis*) occurring before the other (the *evidence*). If we know its cause, it is easy to logically deduce the probability of an effect. However, in this case we want to deduce the probability of a cause from an observed effect, also known as \"reversed probability\". In the case of flood mapping, we have $\\sigma^0$ backscatter observations over land (the effect) and we want to deduce the probability of flooding ($F$) and non-flooding ($NF$). \n", - "\n", - "In other words, we want to know the probability of flooding $P(F)$ given a pixel's $\\sigma^0$:\n", - "\n", - "$$P(F|\\sigma^0)$$\n", - "\n", - "and the probability of a pixel being not flooded $P(NF)$ given a certain $\\sigma^0$:\n", - "\n", - "$$P(NF|\\sigma^0).$$\n", - "\n", - "Bayes showed that these can be deduced from the observation that forward and reversed probability are equal, so that:\n", - "\n", - "$$P(F|\\sigma^0)P(\\sigma^0) = P(\\sigma^0|F)P(F)$$\n", - "\n", - "and\n", - "\n", - "$$P(NF|\\sigma^0)P(\\sigma^0) = P(\\sigma^0|NF)P(NF).$$\n", - "\n", - "\n", - "The forward probability of $\\sigma^0$ given the occurrence of flooding ($P(\\sigma^0|F)$) and $\\sigma^0$ given no flooding ($P(\\sigma^0|NF)$) can be extracted from past information on backscattering over land and water surfaces. As seen in the sketch below (@fig-sat), the characteristics of backscattering over land and water differ considerably.\n", - "\n", - "![Schematic backscattering over land and water. Image from [Geological Survey Ireland](https://www.gsi.ie/images/images/SAR_mapping_land_water.jpg)](https://www.gsi.ie/images/images/SAR_mapping_land_water.jpg){#fig-sat}\n", - "\n", - "## Likelihoods\n", - "\n", - "The so-called likelihoods of $P(\\sigma^0|F)$ and $P(\\sigma^0|NF)$ can thus be calculated from past backscattering information. Without going into the details of how these likelihoods are calculated, you can **click** on a pixel of the map to plot the likelihoods of $\\sigma^0$ being governed by land or water." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#| label: fig-lik\n", - "#| fig-cap: 'Likelihoods for $\\sigma^0$ being associated with land or water for 1 pixel in the Greek area of Thessaly. Likelihoods are calculated over a range of $\\sigma^0$. The pixel''s observed $\\sigma^0$ is given with a vertical line. Click on the map to re-calculate and update this figure for another pixel in the study area. Map created with EOmaps [@quast_getting_2024].'\n", - "\n", - "RANGE = np.arange(-30, 0, 0.1)\n", - "hparam_dc = xr.open_dataset('../data/tuw_s1_harpar/S1_CSAR_IWGRDH/SIG0-HPAR/V0M2R3/EQUI7_EU020M/E054N006T3/D080.nc')\n", - "plia_dc = xr.open_dataset('../data/s1_parameters/S1_CSAR_IWGRDH/PLIA-TAG/V01R03/EQUI7_EU020M/E054N006T3/PLIA-TAG-MEAN_20200101T000000_20201231T235959__D080_E054N006T3_EU020M_V01R03_S1IWGRDH.nc')\n", - "sig0_dc['id'] = (('y', 'x'), np.arange(sig0_dc.SIG0.size).reshape(sig0_dc.SIG0.shape))\n", - "hparam_dc['id'] = (('y', 'x'), np.arange(sig0_dc.SIG0.size).reshape(sig0_dc.SIG0.shape))\n", - "plia_dc['id'] = (('y', 'x'), np.arange(sig0_dc.SIG0.size).reshape(sig0_dc.SIG0.shape))\n", - "\n", - "def calc_water_likelihood(id, x):\n", - " point = plia_dc.where(plia_dc.id == id, drop=True)\n", - " wbsc_mean = point.PLIA * -0.394181 + -4.142015\n", - " wbsc_std = 2.754041\n", - " return norm.pdf(x, wbsc_mean.to_numpy(), wbsc_std).flatten()\n", - "\n", - "def expected_land_backscatter(data, dtime_str):\n", - " w = np.pi * 2 / 365\n", - " dt = datetime.datetime.strptime(dtime_str, \"%Y-%m-%d\")\n", - " t = dt.timetuple().tm_yday\n", - " wt = w * t\n", - "\n", - " M0 = data.M0\n", - " S1 = data.S1\n", - " S2 = data.S2\n", - " S3 = data.S3\n", - " C1 = data.C1\n", - " C2 = data.C2\n", - " C3 = data.C3\n", - " hm_c1 = (M0 + S1 * np.sin(wt)) + (C1 * np.cos(wt))\n", - " hm_c2 = ((hm_c1 + S2 * np.sin(2 * wt)) + C2 * np.cos(2 * wt))\n", - " hm_c3 = ((hm_c2 + S3 * np.sin(3 * wt)) + C3 * np.cos(3 * wt))\n", - " return hm_c3\n", - "\n", - "def calc_land_likelihood(id, x):\n", - " point = hparam_dc.where(hparam_dc.id == id, drop=True)\n", - " lbsc_mean = expected_land_backscatter(point, '2018-02-01')\n", - " lbsc_std = point.STD\n", - " return norm.pdf(x, lbsc_mean.to_numpy(), lbsc_std.to_numpy()).flatten()\n", - "\n", - "def calc_likelihoods(id, x):\n", - " if isinstance(x, list):\n", - " x = np.arange(x[0], x[1], 0.1)\n", - " water_likelihood, land_likelihood = calc_water_likelihood(id=id, x=x), calc_land_likelihood(id=id, x=x)\n", - " return water_likelihood, land_likelihood\n", - "\n", - "def view_bayes_flood(sig0_dc, calc_posteriors=None, bayesian_flood_decision=None):\n", - "\n", - " # initialize a map on top\n", - " m = Maps(ax=122, layer=\"data\", crs=Maps.CRS.Equi7_EU)\n", - "\n", - " # initialize 2 matplotlib plot-axes next to the map\n", - " ax_upper = m.f.add_subplot(221)\n", - " ax_upper.set_ylabel(\"likelihood\")\n", - " ax_upper.set_xlabel(\"$\\sigma^0 (dB)$\")\n", - "\n", - " ax_lower = m.f.add_subplot(223)\n", - " ax_lower.set_ylabel(\"probability\")\n", - " ax_lower.set_xlabel(\"$\\sigma^0 (dB)$\")\n", - "\n", - " # -------- assign data to the map and plot it\n", - " if bayesian_flood_decision is not None:\n", - " # add map\n", - " m2 = m.new_layer(layer=\"map\")\n", - " m2.add_wms.OpenStreetMap.add_layer.default()\n", - " flood_classification = bayesian_flood_decision(sig0_dc.id, sig0_dc.SIG0)\n", - " sig0_dc[\"decision\"] = (('y', 'x'), flood_classification.reshape(sig0_dc.SIG0.shape))\n", - " sig0_dc[\"decision\"] = sig0_dc.decision.where(sig0_dc.SIG0.notnull())\n", - " sig0_dc[\"decision\"] = sig0_dc.decision.where(sig0_dc.decision==0)\n", - " m.set_data(data=sig0_dc, x=\"x\", y=\"y\", parameter=\"decision\", crs=Maps.CRS.Equi7_EU)\n", - " m.plot_map()\n", - " m.show_layer(\"map\", (\"data\", 0.5))\n", - " m.apply_layout(\n", - " {\n", - " 'figsize': [7.32, 4.59],\n", - " '0_map': [0.44573, 0.11961, 0.3375, 0.75237],\n", - " '1_': [0.10625, 0.5781, 0.3125, 0.29902],\n", - " '2_': [0.10625, 0.11961, 0.3125, 0.29902],\n", - " }\n", - " )\n", - "\n", - " else:\n", - " m.set_data(data=sig0_dc, x=\"x\", y=\"y\", parameter=\"SIG0\", crs=Maps.CRS.Equi7_EU)\n", - " m.plot_map()\n", - " m.add_colorbar(label=\"$\\sigma^0$ (dB)\", orientation=\"vertical\", hist_bins=30)\n", - " m.apply_layout(\n", - " {\n", - " 'figsize': [7.32, 4.59],\n", - " '0_map': [0.44573, 0.11961, 0.3375, 0.75237],\n", - " '1_': [0.10625, 0.5781, 0.3125, 0.29902],\n", - " '2_': [0.10625, 0.11961, 0.3125, 0.29902],\n", - " '3_cb': [0.8, 0.09034, 0.1, 0.85],\n", - " '3_cb_histogram_size': 0.8\n", - " }\n", - " )\n", - "\n", - " def update_plots(ID, **kwargs):\n", - " \n", - " # get the data\n", - " value = sig0_dc.where(sig0_dc.id == ID, drop=True).SIG0.to_numpy()\n", - " y1_pdf, y2_pdf = calc_water_likelihood(ID, RANGE), calc_land_likelihood(ID, RANGE)\n", - "\n", - " # plot the lines and vline\n", - " (water,) = ax_upper.plot(RANGE, y1_pdf, 'k-', lw=2, label=\"water\")\n", - " (land,) = ax_upper.plot(RANGE, y2_pdf,'r-', lw=5, alpha=0.6, label=\"land\")\n", - " value_left = ax_upper.vlines(x=value, ymin=0, ymax=np.max((y1_pdf, y2_pdf)), lw=3, label=\"observed\")\n", - " ax_upper.legend(loc=\"upper left\")\n", - "\n", - " # add all artists as \"temporary pick artists\" so that they\n", - " # are removed when the next datapoint is selected\n", - " for a in [water, land, value_left]:\n", - " m.cb.pick.add_temporary_artist(a)\n", - "\n", - " if calc_posteriors is not None:\n", - " f_post, nf_post = calc_posteriors(y1_pdf, y2_pdf)\n", - " (f,) = ax_lower.plot(RANGE, f_post, 'k-', lw=2, label=\"flood\")\n", - " (nf,) = ax_lower.plot(RANGE, nf_post,'r-', lw=5, alpha=0.6, label=\"non-flood\")\n", - " value_right = ax_lower.vlines(x=value, ymin=-0.1, ymax=1.1, lw=3, label=\"observed\")\n", - " ax_lower.legend(loc=\"upper left\")\n", - " for a in [f, nf, value_right]:\n", - " m.cb.pick.add_temporary_artist(a)\n", - "\n", - " # re-compute axis limits based on the new artists\n", - " ax_upper.relim()\n", - " ax_upper.autoscale()\n", - "\n", - " m.cb.pick.attach(update_plots)\n", - " m.cb.pick.attach.mark(permanent=False, buffer=1, fc=\"none\", ec=\"r\")\n", - " m.cb.pick.attach.mark(permanent=False, buffer=2, fc=\"none\", ec=\"r\", ls=\":\")\n", - " m.show()\n", - "\n", - "view_bayes_flood(sig0_dc)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Posteriors\n", - "\n", - "Having calculated the likelihoods, we can now move on to calculate the probability of (non-)flooding given a pixel's $\\sigma^0$. These so-called *posteriors* need one more piece of information, as can be seen in the equation above. We need the probability that a pixel is flooded $P(F)$ or not flooded $P(NF)$. Of course, these are the figures we've been trying to find this whole time. We don't actually have them yet, so what can we do? In Bayesian statistics, we can just start with our best guess. These guesses are called our \"priors\", because they are the beliefs we hold *prior* to looking at the data. This subjective prior belief is the foundation Bayesian statistics, and we use the likelihoods we just calculated to update our belief in this particular hypothesis. This updated belief is called the \"posterior\".\n", - "\n", - "Let's say that our best estimate for the chance of flooding versus non-flooding of a pixel is 50-50: a coin flip. We now can also calculate the probability of backscattering $P(\\sigma^0)$, as the weighted average of the water and land likelihoods, ensuring that our posteriors range between 0 to 1.\n", - "\n", - "The following code block shows how we calculate the priors." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def calc_posteriors(water_likelihood, land_likelihood):\n", - " evidence = (water_likelihood * 0.5) + (land_likelihood * 0.5)\n", - " return (water_likelihood * 0.5) / evidence, (land_likelihood * 0.5) / evidence" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can plot the posterior probabilities of flooding and non-flooding again and compare these to pixel's measured $\\sigma^0$. **Click** on a pixel to calculate the posterior probability." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#| label: fig-post\n", - "#| fig-cap: 'Posterior probabilities for $\\sigma^0$ of 1 pixel being associated with land for water in the Greek area of Thessaly. Click on the map to re-calculate and update this figure for another pixel in the study area. Map created with EOmaps [@quast_getting_2024].'\n", - "view_bayes_flood(sig0_dc, calc_posteriors)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Flood Classification\n", - "\n", - "We are now ready to combine all this information and classify the pixels according to the probability of flooding given the backscatter value of each pixel. Here we just look whether the probability of flooding is higher than non-flooding:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def bayesian_flood_decision(id, sig0_dc):\n", - " nf_post_prob, f_post_prob = calc_posteriors(*calc_likelihoods(id, sig0_dc))\n", - " return np.greater(f_post_prob, nf_post_prob)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Click** on a point in the below map to see the likelihoods and posterior distributions (in the left-hand subplots)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#| label: fig-clas\n", - "#| fig-cap: 'Flood extent of the Greek region of Thessaly based on Bayesian probabilities are shown on the map superimposed on an open street map. Click on a pixel to generate the point''s water and land likelihoods as well as the posterior probabilities. Map created with EOmaps [@quast_getting_2024].'\n", - "view_bayes_flood(sig0_dc, calc_posteriors, bayesian_flood_decision)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "02_floodmapping", - "language": "python", - "name": "02_floodmapping" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} + "nbformat": 4, + "nbformat_minor": 4 +} \ No newline at end of file diff --git a/notebooks/Makefile b/notebooks/Makefile index 84f16e8..9589654 100644 --- a/notebooks/Makefile +++ b/notebooks/Makefile @@ -38,7 +38,6 @@ post-render: for i in $(QMD); do quarto convert $$i; done - mv chapters/*.ipynb notebooks/ >/dev/null 2>&1 - for f in chapters/*.quarto_ipynb ; do mv -- "$f" "${f%.quarto_ipynb}.ipynb" >/dev/null 2>&1; done - python src/clean-nb.py cp Makefile notebooks/ data: diff --git a/notebooks/references.ipynb b/notebooks/references.ipynb index f1c7eb5..b66ff11 100644 --- a/notebooks/references.ipynb +++ b/notebooks/references.ipynb @@ -1,29 +1,24 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# References\n", - "\n", - "Bauer-Marschallinger, Bernhard, Senmao Cao, Mark Edwin Tupas, Florian Roth, Claudio Navacchi, Thomas Melzer, Vahid Freeman, and Wolfgang Wagner. 2022. “Satellite-Based Flood Mapping Through Bayesian Inference from a Sentinel-1 SAR Datacube.” Remote Sensing 14 (15): 3673. https://doi.org/10.3390/rs14153673.\n", - "\n", - "NASA. 2020. “Earth Observatory.” 2020. https://earthobservatory.nasa.gov/features/MeasuringVegetation/measuring_vegetation_2.php.\n", - "\n", - "Quast, Raphael. n.d. “EOmaps: A Python Package to Visualize and Analyze Geographical Datasets.” https://doi.org/10.5281/zenodo.6459598.\n", - "\n", - "Rouse, John Wilson, Rüdiger H Haas, John A Schell, Donald W Deering, et al. 1974. “Monitoring Vegetation Systems in the Great Plains with ERTS.” NASA Spec. Publ 351 (1): 309." - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3", - "path": "/home/runner/.local/share/jupyter/kernels/python3" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# References {.unnumbered}\n", + "\n", + "::: {#refs}\n", + ":::" + ] + } + ], + "metadata": { + "kernelspec": { + "name": "python3", + "language": "python", + "display_name": "Python 3 (ipykernel)", + "path": "/home/runner/.local/share/jupyter/kernels/python3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..7fd26b9 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,3 @@ +[build-system] +requires = ["setuptools"] +build-backend = "setuptools.build_meta" \ No newline at end of file diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..5a13289 --- /dev/null +++ b/setup.cfg @@ -0,0 +1,34 @@ +[metadata] +name = eo_datascience +version = attr: eo_datascience.__version__ +description = Examples of TUWien Jupyter notebooks for education +author = TU Wien GEO MRS group +author_email = martin.schobben@geo.tuwien.ac.at +long_description = file: README.md +url = https://github.com/TUW-GEO/eo-datascience +platforms = any +classifiers = + Intended Audience :: Science/Research + Topic :: Scientific/Engineering + Topic :: Scientific/Engineering :: GIS + Topic :: Software Development :: Libraries + Programming Language :: Python :: 3 + Programming Language :: Python :: 3.8 + Programming Language :: Python :: 3.9 + Operating System :: POSIX + Natural Language :: English + +[options] +package_dir = + = src +packages = find: +install_requires = + nbformat + +[options.packages.find] +where = src + +[options.extras_require] +test = + pytest + pytest-cov diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..6068493 --- /dev/null +++ b/setup.py @@ -0,0 +1,3 @@ +from setuptools import setup + +setup() diff --git a/src/clean-nb.py b/src/clean-nb.py deleted file mode 100644 index dff6e1e..0000000 --- a/src/clean-nb.py +++ /dev/null @@ -1,73 +0,0 @@ -import os -import nbformat -from pathlib import Path -from bs4 import BeautifulSoup - - -def clean_up_frontmatter(): - # Define the path to the notebooks - root = Path('./notebooks').resolve() - nb_paths = [root / file for file in os.listdir(root) if file.endswith('.ipynb')] - - # Iterate over the notebooks - for nb_path in nb_paths: - # Load the notebook - nb = nbformat.read(nb_path, as_version=4) - if nb.cells[0].source.startswith('---'): - #Load frontmatter - fm = nb.cells[0].source.split('\n') - - # Extract the title and the subtitle - title, subtitle = '', '' - for line in fm: - if line.startswith('title'): - title = line.split(': ')[1] - if line.startswith('subtitle'): - subtitle = line.split(': ')[1] - - # Update the cell - nb.cells[0].source = f'# {title}\n{subtitle}\n' - - # Save the notebook - nbformat.write(nb, nb_path) - -def clean_up_references(): - # Load the references.html file - html_file_path = Path('_book/chapters/references.html') - with open(html_file_path, 'r', encoding='utf-8') as file: - html_content = file.read() - - # Parse the HTML content - soup = BeautifulSoup(html_content) - references_div = soup.find('div', {'id': 'refs', 'class': 'references csl-bib-body hanging-indent'}) - - # Format the references as string - references_list = [] - for ref in references_div.get_text().split('\n\n\n'): - ref = ref.replace('\n\n', '') - ref = ref.replace('\n', ' ') - references_list.append(ref) - - # Indent the references - #ref_list = ['\t' + ref for ref in references_list] - - # Merge the references into a single string - output_str = '\n\n'.join(references_list) - - # Load the References notebook - ref_nb_path = Path('./notebooks/references.ipynb').resolve() - nb = nbformat.read(ref_nb_path, as_version=4) - - # Update the cell - nb.cells[0].source = f'# References\n\n{output_str}' - - # Save the notebook - nbformat.write(nb, ref_nb_path) - -def main(): - clean_up_frontmatter() - clean_up_references() - - -if __name__ == '__main__': - main() \ No newline at end of file diff --git a/src/eo_datascience/__init__.py b/src/eo_datascience/__init__.py new file mode 100644 index 0000000..09ba5cd --- /dev/null +++ b/src/eo_datascience/__init__.py @@ -0,0 +1,4 @@ +from eo_datascience._version import __commit__ +from eo_datascience._version import __version__ + +name = "eo_datascience" \ No newline at end of file diff --git a/src/eo_datascience/_version.py b/src/eo_datascience/_version.py new file mode 100644 index 0000000..c55373b --- /dev/null +++ b/src/eo_datascience/_version.py @@ -0,0 +1,2 @@ +__version__ = "v0.0.0" +__commit__ = "0000000" diff --git a/src/eo_datascience/clean_nb.py b/src/eo_datascience/clean_nb.py new file mode 100644 index 0000000..e792ace --- /dev/null +++ b/src/eo_datascience/clean_nb.py @@ -0,0 +1,129 @@ +import os +import nbformat +from pathlib import Path +import re + +def clean_up_frontmatter(dir = './notebooks', save=True): + # Define the path to the notebooks + nb_paths = find_ipynb(dir) + + # Iterate over the notebooks + for nb_path in nb_paths: + # Load the notebook + nb = nbformat.read(nb_path, as_version=4) + if nb.cells[0].source.startswith('---'): + #Load frontmatter + fm = nb.cells[0].source.split('\n') + + # Extract the title and the subtitle and convert + i = 1 + line = fm[i] + new_text = [] + while not line.startswith("---"): + if line.startswith('title'): + new_text.append(f"# {line.split(': ')[1]}") + if line.startswith('subtitle'): + new_text.append(f"**{line.split(': ')[1]}**") + + i += 1 + line = fm[i] + + new_text += fm[i+1:] + nb.cells[0].source = "\n".join(new_text) + "\n" + # Save notebook + if save: + nbformat.write(nb, nb_path) + else: + return nb + +def convert_bibliography(nb_path="./notebooks/references.ipynb", save=True): + nb = nbformat.read(nb_path, as_version=4) + nb.cells[0].source = """# References + +```{bibliography} +``` +""" + # Save the notebook + if save: + nbformat.write(nb, nb_path) + else: + return nb + +def convert_callout_notes(dir="./notebooks", save=True): + nb_paths = find_ipynb(dir) + + # Iterate over the notebooks + for nb_path in nb_paths: + # Load the notebook + nb = nbformat.read(nb_path, as_version=4) + for i in range(len(nb.cells)): + + if nb.cells[i]["cell_type"] == "markdown": + nb.cells[i].source = quarto_note_replace(nb.cells[i].source) + + # Save the notebook + if save: + nbformat.write(nb, nb_path) + else: + return nb + +def quarto_note_replace(quarto): + note_rst_start = r":::{note}" + note_rst_end = r":::" + nts = re.findall(r"(?<=:::\s\{\.callout\-note\})[^:::]+", quarto) + for i in nts: + quarto = re.sub(r":::\s\{\.callout\-note\}" + re.escape(i) + r":::", note_rst_start + i + note_rst_end, quarto) + return quarto + +def convert_refs(dir="./notebooks", save=True): + nb_paths = find_ipynb(dir) + + # Iterate over the notebooks + for nb_path in nb_paths: + # Load the notebook + nb = nbformat.read(nb_path, as_version=4) + for i in range(len(nb.cells)): + if i != 0: + if nb.cells[i]["cell_type"] == "markdown": + nb.cells[i].source = quarto_ref_figure_replace(nb.cells[i].source) + nb.cells[i].source = quarto_ref_person_replace(nb.cells[i].source) + nb.cells[i].source = quarto_ref_time_replace(nb.cells[i].source) + + # Save the notebook + if save: + nbformat.write(nb, nb_path) + else: + return nb + +def quarto_ref_figure_replace(quarto): + bibs = re.findall(r"(?<=\(\@)[^\)]+", quarto) + for i in bibs: + quarto = re.sub(r"\(\@" + i + "\)", r"", quarto) + return quarto + +def quarto_ref_person_replace(quarto): + bibs = re.findall(r"(?<=\[\@)[^\]]+", quarto) + for i in bibs: + quarto = re.sub(r"\[\@" + i + "\]", r"{cite:p}`" + i + "`", quarto) + return quarto + +def quarto_ref_time_replace(quarto): + bibs = re.findall(r"(?<=\@)[^\s]+", quarto) + for i in bibs: + quarto = re.sub(r"\@" + i, r"{cite:t}`" + i + "`", quarto) + return quarto + +def find_ipynb(dir): + root = Path(dir).resolve() + nb_paths = [root / file for file in os.listdir(root) if file.endswith('.ipynb')] + return nb_paths + +def main(): + clean_up_frontmatter() + convert_callout_notes() + convert_refs() + convert_bibliography() + + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/src/eo_datascience/render_sfinx_toc.py b/src/eo_datascience/render_sfinx_toc.py new file mode 100644 index 0000000..b4aedea --- /dev/null +++ b/src/eo_datascience/render_sfinx_toc.py @@ -0,0 +1,26 @@ +from pathlib import Path +import yaml + +def render_toc(): + p = Path("notebooks").glob('*.ipynb') + files = [x.with_suffix('') for x in p if x.is_file()] + files.sort() + titles = [x.stem.split("_")[-1].title() for x in files] + ls = [dict(caption="Preamble", chapters=[dict(file="notebooks/how-to-cite")])] + for i in range(len(titles)): + ls.append(dict(caption=titles[i], chapters=[dict(file=str(files[i]))])) + + toc = dict( + format="jb-book", + root="README", + parts=ls + ) + + with open('_toc.yml', 'w+') as ff: + yaml.dump(toc, ff) + +def main(): + render_toc() + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/src/render_sfinx_toc.py b/src/render_sfinx_toc.py deleted file mode 100644 index 7c34ae2..0000000 --- a/src/render_sfinx_toc.py +++ /dev/null @@ -1,19 +0,0 @@ -from pathlib import Path -import yaml - -p = Path("notebooks").glob('*.ipynb') -files = [x.with_suffix('') for x in p if x.is_file()] -titles = [x.stem.split("_")[-1].title() for x in files] -ls = [] -for i in range(len(titles)): - ls.append(dict(caption=titles[i], chapters=[dict(file=str(files[i]))])) -ls.append(dict(caption="Preamble", chapters=[dict(file="notebooks/how-to-cite")])) -ls.reverse() -toc = dict( - format="jb-book", - root="README", - parts=ls -) - -with open('_toc.yml', 'w+') as ff: - yaml.dump(toc, ff) \ No newline at end of file diff --git a/tests/mock.ipynb b/tests/mock.ipynb new file mode 100644 index 0000000..8a45aca --- /dev/null +++ b/tests/mock.ipynb @@ -0,0 +1,40 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "title: This a mock Jupyter file\n", + "subtitle: We use it for testing\n", + "author: anonymous\n", + "---\n", + "\n", + "Some other text, which should not be deleted!" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "::: {.callout-note}\n", + "This a callout note.\n", + ":::" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "lorem ipsum [@anon2024] and [@anon2025] and lorem ipsum @anon2024 and @anon2025" + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/tests/test_quarto_nb_conversions.py b/tests/test_quarto_nb_conversions.py new file mode 100644 index 0000000..95451b6 --- /dev/null +++ b/tests/test_quarto_nb_conversions.py @@ -0,0 +1,20 @@ +import nbformat +from pathlib import Path +import pytest +from eo_datascience.clean_nb import clean_up_frontmatter, convert_refs, quarto_ref_person_replace, quarto_ref_time_replace, \ + convert_callout_notes, quarto_note_replace + +def test_remove_front_matter(): + assert clean_up_frontmatter("./tests", False)["cells"][0]["source"] == '# This a mock Jupyter file\n**We use it for testing**\n\nSome other text, which should not be deleted!\n' + +def test_conversion_of_refs(): + quarto = [r"lorem ipsum [@anon2024] and [@anon2025]", r"lorem ipsum @anon2024 and @anon2025"] + quarto[0] = quarto_ref_person_replace(quarto[0]) + quarto[1] = quarto_ref_time_replace(quarto[1]) + assert quarto == [r"lorem ipsum {cite:p}`anon2024` and {cite:p}`anon2025`", r"lorem ipsum {cite:t}`anon2024` and {cite:t}`anon2025`"] + assert convert_refs("./tests", False)["cells"][2]["source"] == r"lorem ipsum {cite:p}`anon2024` and {cite:p}`anon2025` and lorem ipsum {cite:t}`anon2024` and {cite:t}`anon2025`" + +def test_conversion_of_callout_notes(): + rst = ':::{note}\nThis a callout note.\n:::' + assert quarto_note_replace(r"::: {.callout-note}\nThis a callout note.\n:::") == rst + assert convert_callout_notes("./tests", False)["cells"][1]["source"] == rst