From 977f6456131a600d8b8c3b7962018b9a44b32004 Mon Sep 17 00:00:00 2001 From: Matic Lubej Date: Wed, 24 Jan 2024 08:04:50 +0100 Subject: [PATCH 1/3] add pandas parquet loader and update vector comparison (#325) Co-authored-by: Matic Lubej --- eogrow/utils/testing.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/eogrow/utils/testing.py b/eogrow/utils/testing.py index f2ab44ba..9b3a39e0 100644 --- a/eogrow/utils/testing.py +++ b/eogrow/utils/testing.py @@ -14,11 +14,12 @@ import fs import geopandas as gpd import numpy as np +import pandas as pd import rasterio from deepdiff import DeepDiff from fs.base import FS from fs.osfs import OSFS -from shapely import MultiPolygon, Point, Polygon +from shapely import MultiPolygon, Point, Polygon, wkb, wkt from eolearn.core import EOPatch, FeatureType from eolearn.core.eodata_io import get_filesystem_data_info @@ -90,13 +91,26 @@ def calculate_statistics(folder: str, config: StatCalcConfig) -> JsonDict: elif content_path.endswith((".geojson", ".gpkg")): stats[content] = _calculate_vector_stats(gpd.read_file(content_path), config) elif content_path.endswith(".parquet"): - stats[content] = _calculate_vector_stats(gpd.read_parquet(content_path), config) + try: + data = gpd.read_parquet(content_path) + except Exception: + data = _load_as_geoparquet(content_path) + stats[content] = _calculate_vector_stats(data, config) else: stats[content] = None return stats +def _load_as_geoparquet(path: str) -> gpd.GeoDataFrame: + data = pd.read_parquet(path) + if isinstance(data.geometry.iloc[0], str): + data.geometry = data.geometry.apply(wkt.loads) + elif isinstance(data.geometry.iloc[0], bytes): + data.geometry = data.geometry.apply(wkb.loads) + return gpd.GeoDataFrame(data, geometry="geometry", crs=data.utm_crs.iloc[0]) + + def _calculate_eopatch_stats(eopatch: EOPatch, config: StatCalcConfig) -> JsonDict: """Calculates statistics of given EOPatch and it's content""" stats: JsonDict = defaultdict(dict) From 619d922642c9b619a3eaf4273a0402ab238798e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=BDiga=20Luk=C5=A1i=C4=8D?= <31988337+zigaLuksic@users.noreply.github.com> Date: Mon, 29 Jan 2024 13:14:12 +0100 Subject: [PATCH 2/3] Fix moto and pytest after updates (#326) --- pyproject.toml | 3 +-- tests/core/area/test_utm.py | 29 ++++++++++++----------------- tests/core/test_logging.py | 6 +++--- tests/utils/test_filter.py | 4 ++-- 4 files changed, 18 insertions(+), 24 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index c19ee298..a5f846ee 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -75,12 +75,11 @@ dev = [ "build", "deepdiff", "fs_s3fs", - "moto", + "moto[s3]>=5.0.0", "mypy>=0.990", "pre-commit", "pyogrio", "pytest-cov", - "pytest-lazy-fixture", "pytest-order", "pytest>=4.0.0", "requests-mock", diff --git a/tests/core/area/test_utm.py b/tests/core/area/test_utm.py index 791b6b7d..90736bba 100644 --- a/tests/core/area/test_utm.py +++ b/tests/core/area/test_utm.py @@ -5,28 +5,23 @@ from eogrow.core.area import UtmZoneAreaManager +LARGE_AREA_CONFIG = { + "geometry_filename": "test_large_area.geojson", + "patch": {"size_x": 1000000, "size_y": 1000000, "buffer_x": 0, "buffer_y": 0}, +} -@pytest.fixture(scope="session", name="large_area_config") -def large_area_config_fixture(): - return { - "geometry_filename": "test_large_area.geojson", - "patch": {"size_x": 1000000, "size_y": 1000000, "buffer_x": 0, "buffer_y": 0}, - } - -@pytest.fixture(scope="session", name="area_config") -def area_config_fixture(): - return { - "geometry_filename": "test_area.geojson", - "patch": {"size_x": 2400, "size_y": 1100, "buffer_x": 120, "buffer_y": 55}, - } +AREA_CONFIG = { + "geometry_filename": "test_area.geojson", + "patch": {"size_x": 2400, "size_y": 1100, "buffer_x": 120, "buffer_y": 55}, +} @pytest.mark.parametrize( ("config", "expected_zone_num", "expected_bbox_num"), [ - (pytest.lazy_fixture("area_config"), 1, 2), - (pytest.lazy_fixture("large_area_config"), 71, 368), + (AREA_CONFIG, 1, 2), + (LARGE_AREA_CONFIG, 71, 368), ], ) def test_bbox_split(storage, config, expected_zone_num, expected_bbox_num): @@ -46,7 +41,7 @@ def test_bbox_split(storage, config, expected_zone_num, expected_bbox_num): assert bbox_count == expected_bbox_num -def test_cache_name(storage, area_config): - area_manager = UtmZoneAreaManager.from_raw_config(area_config, storage) +def test_cache_name(storage): + area_manager = UtmZoneAreaManager.from_raw_config(AREA_CONFIG, storage) assert area_manager.get_grid_cache_filename() == "UtmZoneAreaManager_test_area_2400_1100_120.0_55.0_0.0_0.0.gpkg" diff --git a/tests/core/test_logging.py b/tests/core/test_logging.py index 55f3fed2..2213ed93 100644 --- a/tests/core/test_logging.py +++ b/tests/core/test_logging.py @@ -4,7 +4,7 @@ import pytest from fs.tempfs import TempFS from fs_s3fs import S3FS -from moto import mock_s3 +from moto import mock_aws from eolearn.core import EOExecutor, EONode, EOTask, EOWorkflow @@ -17,7 +17,7 @@ def execute(self, *_, value=0): raise ValueError(f"Value is {value}") -@mock_s3 +@mock_aws def _create_new_s3_fs(): """Creates a new empty mocked s3 bucket. If one such bucket already exists it deletes it first.""" bucket_name = "mocked-test-bucket" @@ -35,7 +35,7 @@ def _create_new_s3_fs(): return S3FS(bucket_name=bucket_name) -@mock_s3 +@mock_aws @pytest.mark.parametrize("fs_loader", [TempFS, _create_new_s3_fs]) @pytest.mark.parametrize( "logs_handler_factory", [EOExecutionHandler, functools.partial(RegularBackupHandler, backup_interval=0.01)] diff --git a/tests/utils/test_filter.py b/tests/utils/test_filter.py index 2b9d5746..68aeaa75 100644 --- a/tests/utils/test_filter.py +++ b/tests/utils/test_filter.py @@ -6,7 +6,7 @@ import pytest from fs.tempfs import TempFS from fs_s3fs import S3FS -from moto import mock_s3 +from moto import mock_aws from eolearn.core import EOPatch, FeatureType from sentinelhub import CRS, BBox @@ -41,7 +41,7 @@ def eopatch_fixture(): @pytest.fixture(name="mock_s3fs", scope="session") def mock_s3fs_fixture(eopatch): - with mock_s3(): + with mock_aws(): s3resource = boto3.resource("s3", region_name="eu-central-1") s3resource.create_bucket(Bucket=BUCKET_NAME, CreateBucketConfiguration={"LocationConstraint": "eu-central-1"}) mock_s3fs = S3FS(BUCKET_NAME) From 28140924dcd11decfd437d31b9e0805634961e75 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=BDiga=20Luk=C5=A1i=C4=8D?= <31988337+zigaLuksic@users.noreply.github.com> Date: Mon, 29 Jan 2024 15:10:44 +0100 Subject: [PATCH 3/3] Remove automatic pipeline retries and add better timestamp parsing support (#327) * disable pipeline retries in pipeline chains * add better timestamp parsing support * update and rerun pre-commit * update init and changelog * add dateutil * add stubs * correct name --- .pre-commit-config.yaml | 4 ++-- CHANGELOG.md | 8 +++++++- eogrow/__init__.py | 2 +- eogrow/types.py | 3 +-- eogrow/utils/pipeline_chain.py | 2 +- eogrow/utils/validators.py | 8 ++++---- pyproject.toml | 3 +++ tests/core/area/test_batch.py | 1 + tests/utils/test_validators.py | 17 +++++++++-------- 9 files changed, 29 insertions(+), 19 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 6a708d55..a48bc256 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -20,13 +20,13 @@ repos: types_or: [json] - repo: https://github.com/psf/black - rev: 23.12.1 + rev: 24.1.1 hooks: - id: black language_version: python3 - repo: https://github.com/charliermarsh/ruff-pre-commit - rev: "v0.1.11" + rev: "v0.1.14" hooks: - id: ruff diff --git a/CHANGELOG.md b/CHANGELOG.md index 8ed7e750..481e032e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,10 @@ -## [Version 1.7.4] - 2024-01-10 +## [Version 1.7.6] - 2024-01-29 + +- Pipelines that are run as part of a pipeline-chain execution will now no longer be retried by ray in the case when an exception occurs. +- Parsing time ranges now has support for more formats. + + +## [Version 1.7.5] - 2024-01-10 - Parameter `raise_if_failed` renamed to `raise_on_failure` and is now enabled by default. - Numpy version restricted in anticipation of numpy 2.0 release. diff --git a/eogrow/__init__.py b/eogrow/__init__.py index 8030e953..e014549f 100644 --- a/eogrow/__init__.py +++ b/eogrow/__init__.py @@ -1,3 +1,3 @@ """The main module of the eo-grow package.""" -__version__ = "1.7.5" +__version__ = "1.7.6" diff --git a/eogrow/types.py b/eogrow/types.py index 7f543dc2..d66ce1f0 100644 --- a/eogrow/types.py +++ b/eogrow/types.py @@ -1,5 +1,4 @@ -""" Includes custom types used in schemas -""" +"""Includes custom types used in schemas""" import datetime import sys diff --git a/eogrow/utils/pipeline_chain.py b/eogrow/utils/pipeline_chain.py index ecde54de..f876c372 100644 --- a/eogrow/utils/pipeline_chain.py +++ b/eogrow/utils/pipeline_chain.py @@ -44,6 +44,6 @@ def run_pipeline_chain(pipeline_chain: list[RawConfig]) -> None: ray.get(runner.remote(run_schema.pipeline_config)) -@ray.remote +@ray.remote(max_retries=0) def _pipeline_runner(config: RawConfig) -> None: return load_pipeline_class(config).from_raw_config(config).run() diff --git a/eogrow/utils/validators.py b/eogrow/utils/validators.py index 4e050a27..5060fc9c 100644 --- a/eogrow/utils/validators.py +++ b/eogrow/utils/validators.py @@ -4,11 +4,11 @@ from __future__ import annotations -import datetime as dt import inspect from typing import TYPE_CHECKING, Any, Callable, Iterable, Tuple, Union import numpy as np +from dateutil.parser import isoparse from pydantic import BaseModel, Field, validator from eolearn.core import FeatureType @@ -144,8 +144,8 @@ def parse_time_period(value: tuple[str, str]) -> TimePeriod: } value = start_dates[kind], end_dates[kind] - start = dt.datetime.strptime(value[0], "%Y-%m-%d").date() - end = dt.datetime.strptime(value[1], "%Y-%m-%d").date() + start = isoparse(value[0]) + end = isoparse(value[1]) assert start <= end, "Invalid start and end dates provided. End date must follow the start date" return start, end @@ -241,7 +241,7 @@ def parse_data_collection(value: str | dict | DataCollection) -> DataCollection: def restrict_types( - allowed_feature_types: Iterable[FeatureType] | Callable[[FeatureType], bool] + allowed_feature_types: Iterable[FeatureType] | Callable[[FeatureType], bool], ) -> Callable[[Feature], Feature]: """Validates a field representing a feature, where it restricts the possible feature types.""" diff --git a/pyproject.toml b/pyproject.toml index a5f846ee..3254e96c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -52,6 +52,7 @@ dependencies = [ "opencv-python-headless", "pandas", "pydantic>=1.8.0, <2.0", + "python-dateutil", "python-rapidjson", "rasterio", "ray[default]", @@ -72,6 +73,7 @@ docs = [ dev = [ "eo-grow[ML]", "boto3", + "boto3-stubs", "build", "deepdiff", "fs_s3fs", @@ -85,6 +87,7 @@ dev = [ "requests-mock", "scipy", "twine", + "types-python-dateutil", "types-mock", "types-requests", "types-setuptools", diff --git a/tests/core/area/test_batch.py b/tests/core/area/test_batch.py index 3f8b7bca..2c49a073 100644 --- a/tests/core/area/test_batch.py +++ b/tests/core/area/test_batch.py @@ -7,6 +7,7 @@ - Batch request definition endpoint. - Tiling grid request endpoints. - Mocking requests of iter_tiles would be too much effort, so the `_make_new_split` of the splitter is mocked instead. + """ from unittest.mock import patch diff --git a/tests/utils/test_validators.py b/tests/utils/test_validators.py index 72d7dfeb..87afb2c2 100644 --- a/tests/utils/test_validators.py +++ b/tests/utils/test_validators.py @@ -170,18 +170,19 @@ class DummySchema(Pipeline.Schema): @pytest.mark.parametrize( - ("time_period", "year", "expected_start_date", "expected_end_date"), + ("first_param", "second_param", "expected_start_date", "expected_end_date"), [ - ("yearly", 2020, "2020-01-01", "2020-12-31"), - ("Q2", 2021, "2021-04-01", "2021-06-30"), - ("Q2-yearly", 2021, "2020-07-01", "2021-06-30"), + ("yearly", 2020, "2020-01-01T00:00:00", "2020-12-31T00:00:00"), + ("Q2", 2021, "2021-04-01T00:00:00", "2021-06-30T00:00:00"), + ("Q2-yearly", 2021, "2020-07-01T00:00:00", "2021-06-30T00:00:00"), + ("2022-02-02", "2022-02-22T22:22:02", "2022-02-02T00:00:00", "2022-02-22T22:22:02"), ], ) -def test_parse_time_period(time_period, year, expected_start_date, expected_end_date): - start_date, end_date = parse_time_period([time_period, year]) +def test_parse_time_period(first_param, second_param, expected_start_date, expected_end_date): + start_date, end_date = parse_time_period([first_param, second_param]) - assert isinstance(start_date, dt.date) - assert isinstance(end_date, dt.date) + assert isinstance(start_date, dt.datetime) + assert isinstance(end_date, dt.datetime) assert start_date.isoformat() == expected_start_date assert end_date.isoformat() == expected_end_date