Skip to content

Commit

Permalink
import tiffs now lets rasterio do it's thing
Browse files Browse the repository at this point in the history
  • Loading branch information
zigaLuksic committed Nov 2, 2023
1 parent a0d6b28 commit 15a15eb
Show file tree
Hide file tree
Showing 3 changed files with 29 additions and 74 deletions.
69 changes: 20 additions & 49 deletions eolearn/io/raster_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,16 +13,13 @@
import logging
import warnings
from abc import ABCMeta
from typing import BinaryIO

import fs
import numpy as np
import rasterio
import rasterio.warp
from affine import Affine
from fs.base import FS
from fs.osfs import OSFS
from fs.tempfs import TempFS
from fs_s3fs import S3FS
from rasterio.io import DatasetReader
from rasterio.session import AWSSession
Expand All @@ -32,7 +29,7 @@

from eolearn.core import EOPatch
from eolearn.core.core_tasks import IOTask
from eolearn.core.exceptions import EORuntimeWarning
from eolearn.core.exceptions import EODeprecationWarning, EORuntimeWarning
from eolearn.core.types import Feature
from eolearn.core.utils.fs import get_base_filesystem_and_path, get_full_path

Expand Down Expand Up @@ -411,7 +408,7 @@ def __init__(
feature: Feature,
folder: str,
*,
use_vsi: bool = False,
use_vsi: bool | None = None,
timestamp_size: int | None = None,
filesystem: FS | None = None,
image_dtype: np.dtype | type | None = None,
Expand All @@ -421,11 +418,7 @@ def __init__(
"""
:param feature: EOPatch feature into which data will be imported
:param folder: A directory containing image files or a path of an image file
:param use_vsi: A flag to define if reading should be done with GDAL/rasterio virtual system (VSI)
functionality. The flag only has an effect when the task is used to read an image from a remote storage
(i.e. AWS S3 bucket). For a performance improvement it is recommended to set this to `True` when reading a
smaller chunk of a larger image, especially if it is a Cloud-optimized GeoTIFF (COG). In other cases the
reading might be faster if the flag remains set to `False`.
:param use_vsi: Deprecated.
:param timestamp_size: In case data will be imported into a time-dependant feature this parameter can be used to
specify time dimension. If not specified, time dimension will be the same as size of the timestamps
attribute. If timestamps do not exist this value will be set to 1. When converting data into a feature
Expand All @@ -448,49 +441,27 @@ def __init__(
config=config,
)

self.use_vsi = use_vsi
if use_vsi is not None:
warnings.warn(
"The parameter `use_vsi` has been deprecated and has no effect.", EODeprecationWarning, stacklevel=2
)
self.timestamp_size = timestamp_size

def _get_session(self, filesystem: FS) -> AWSSession:
"""Creates a session object with credentials from a config object."""
if not isinstance(filesystem, S3FS):
raise NotImplementedError("A rasterio session for VSI reading for now only works for AWS S3 filesystems")

return AWSSession(
aws_access_key_id=filesystem.aws_access_key_id,
aws_secret_access_key=filesystem.aws_secret_access_key,
aws_session_token=filesystem.aws_session_token,
region_name=filesystem.region,
endpoint_url=filesystem.endpoint_url,
)

def _load_from_image(self, path: str, filesystem: FS, bbox: BBox | None) -> tuple[np.ndarray, BBox | None]:
"""The method decides in what way data will be loaded from the image.
"""The method decides in what way data will be loaded from the image."""
full_path = get_full_path(filesystem, path)

env_kwargs = {}
if isinstance(filesystem, S3FS):
env_kwargs["session"] = AWSSession(
aws_access_key_id=filesystem.aws_access_key_id,
aws_secret_access_key=filesystem.aws_secret_access_key,
aws_session_token=filesystem.aws_session_token,
region_name=filesystem.region,
endpoint_url=filesystem.endpoint_url,
)

The method always uses `rasterio.Env` to suppress any low-level warnings. In case of a local filesystem
benchmarks show that without `filesystem.openbin` in some cases `rasterio` can read much faster. Otherwise,
reading depends on `use_vsi` flag. In some cases where a sub-image window is read and the image is in certain
format (e.g. COG), benchmarks show that reading with virtual system (VSI) is much faster. In other cases,
reading with `filesystem.openbin` is faster.
"""
if isinstance(filesystem, (OSFS, TempFS)):
full_path = filesystem.getsyspath(path)
with rasterio.Env():
return self._read_image(full_path, bbox)

if self.use_vsi:
session = self._get_session(filesystem)
with rasterio.Env(session=session):
full_path = get_full_path(filesystem, path)
return self._read_image(full_path, bbox)

with rasterio.Env(), filesystem.openbin(path, "r") as file_handle:
return self._read_image(file_handle, bbox)

def _read_image(self, file_object: str | BinaryIO, bbox: BBox | None) -> tuple[np.ndarray, BBox | None]:
"""Reads data from the image."""
src: DatasetReader
with rasterio.open(file_object) as src:
with rasterio.Env(**env_kwargs), rasterio.open(full_path) as src:
read_window, read_bbox = self._get_reading_window_and_bbox(src, bbox)
boundless_reading = read_window is not None
return src.read(window=read_window, boundless=boundless_reading, fill_value=self.no_data_value), read_bbox
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ dependencies = [
"typing-extensions",
"opencv-python-headless",
"affine",
"rasterio>=1.2.7, <1.3.8",
"rasterio>=1.3.8",
"shapely",
"fiona>=1.8.18",
]
Expand Down
32 changes: 8 additions & 24 deletions tests/io/test_raster_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,11 @@
import warnings
from typing import Any

import boto3
import numpy as np
import pytest
import rasterio
from conftest import TEST_EOPATCH_PATH
from fs.errors import ResourceNotFound
from moto import mock_s3
from numpy.testing import assert_array_equal

from sentinelhub import CRS, BBox, read_data
Expand All @@ -33,17 +31,6 @@

logging.basicConfig(level=logging.DEBUG)

BUCKET_NAME = "mocked-test-bucket"
PATH_ON_BUCKET = f"s3://{BUCKET_NAME}/some-folder"


@pytest.fixture(autouse=True)
def _create_s3_bucket_fixture():
with mock_s3():
s3resource = boto3.resource("s3", region_name="eu-central-1")
s3resource.create_bucket(Bucket=BUCKET_NAME, CreateBucketConfiguration={"LocationConstraint": "eu-central-1"})
yield


@pytest.fixture(autouse=True)
def _ignore_warnings():
Expand Down Expand Up @@ -274,14 +261,11 @@ def test_export2tiff_separate_timestamps(test_eopatch):
assert os.path.exists(expected_path), f"Path {expected_path} does not exist"


# The following is not a proper test of use_vsi parameter. A proper test would require loading from an S3 path however
# moto is not able to mock that because GDAL VSIS is not using boto3.
@pytest.mark.parametrize("use_vsi", [True]) # , False])
def test_import_tiff_subset(test_eopatch, example_data_path, use_vsi):
def test_import_tiff_subset(test_eopatch, example_data_path):
path = os.path.join(example_data_path, "import-tiff-test1.tiff")
mask_feature = FeatureType.MASK_TIMELESS, "TEST_TIF"

task = ImportFromTiffTask(mask_feature, path, use_vsi=use_vsi)
task = ImportFromTiffTask(mask_feature, path)
task(test_eopatch)

tiff_img = read_data(path)
Expand Down Expand Up @@ -321,18 +305,18 @@ def test_import_tiff_intersecting(test_eopatch, example_data_path):
assert unique_values == [no_data_value], f"No data values should all be equal to {no_data_value}"


@pytest.mark.skip() # rasterio 1.3.8 and moto have some issues, wait for it to be resolved
def test_timeless_feature(test_eopatch):
feature = FeatureType.DATA_TIMELESS, "DEM"
filename = "relative-path/my-filename.tiff"

export_task = ExportToTiffTask(feature, folder=PATH_ON_BUCKET)
import_task = ImportFromTiffTask(feature, folder=PATH_ON_BUCKET)
with tempfile.TemporaryDirectory() as tmp_dir_name:
export_task = ExportToTiffTask(feature, folder=tmp_dir_name)
import_task = ImportFromTiffTask(feature, folder=tmp_dir_name)

export_task.execute(test_eopatch, filename=filename)
new_eopatch = import_task.execute(test_eopatch, filename=filename)
export_task.execute(test_eopatch, filename=filename)
new_eopatch = import_task.execute(test_eopatch, filename=filename)

assert_array_equal(new_eopatch[feature], test_eopatch[feature])
assert_array_equal(new_eopatch[feature], test_eopatch[feature])


def test_time_dependent_feature(test_eopatch):
Expand Down

0 comments on commit 15a15eb

Please sign in to comment.