Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support regular parquet in test statistics #330

Merged
merged 21 commits into from
Mar 13, 2024
Merged
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 17 additions & 12 deletions eogrow/utils/testing.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
from deepdiff import DeepDiff
from fs.base import FS
from fs.osfs import OSFS
from shapely import MultiPolygon, Point, Polygon, wkb, wkt
from shapely import MultiPolygon, Point, Polygon

from eolearn.core import EOPatch, FeatureType
from eolearn.core.eodata_io import get_filesystem_data_info
Expand Down Expand Up @@ -93,24 +93,16 @@ def calculate_statistics(folder: str, config: StatCalcConfig) -> JsonDict:
elif content_path.endswith(".parquet"):
try:
data = gpd.read_parquet(content_path)
stats[content] = _calculate_vector_stats(data, config)
except Exception:
data = _load_as_geoparquet(content_path)
stats[content] = _calculate_vector_stats(data, config)
data = pd.read_parquet(content_path)
stats[content] = _calculate_parquet_stats(data, config)
else:
stats[content] = None

return stats


def _load_as_geoparquet(path: str) -> gpd.GeoDataFrame:
    """Reads a parquet file with pandas and wraps it into a GeoDataFrame.

    The type of the first value in the `geometry` column decides how the column is decoded: strings are parsed with
    `wkt.loads`, bytes with `wkb.loads`, and anything else is left untouched. The CRS is taken from the first value
    of the `utm_crs` column.
    """
    frame = pd.read_parquet(path)

    # Only the first geometry is inspected; the whole column is decoded with the matching loader.
    first_geometry = frame.geometry.iloc[0]
    if isinstance(first_geometry, str):
        frame.geometry = frame.geometry.apply(wkt.loads)
    elif isinstance(first_geometry, bytes):
        frame.geometry = frame.geometry.apply(wkb.loads)

    crs = frame.utm_crs.iloc[0]
    return gpd.GeoDataFrame(frame, geometry="geometry", crs=crs)


def _calculate_eopatch_stats(eopatch: EOPatch, config: StatCalcConfig) -> JsonDict:
"""Calculates statistics of given EOPatch and it's content"""
stats: JsonDict = defaultdict(dict)
Expand Down Expand Up @@ -217,6 +209,19 @@ def _get_coords_sample(geom: Polygon | MultiPolygon | Any) -> list[tuple[float,
return stats


def _calculate_parquet_stats(data: pd.DataFrame, config: StatCalcConfig) -> JsonDict:
    """Calculates statistics of a regular (non-geo) parquet table.

    The result always contains the column names (`columns`) and the number of rows (`row_count`). For a non-empty
    table it also contains `random_rows`: a sample of at most `config.num_random_values` rows, drawn with a fixed
    random state so that repeated runs pick the same rows, serialized through JSON and keyed by the row index.

    :param data: Table loaded from the parquet file.
    :param config: Statistics configuration; only `num_random_values` is read here.
    :return: A JSON-serializable dictionary with the statistics.
    """
    stats: JsonDict = {"columns": list(data), "row_count": len(data)}

    if not len(data):
        return stats

    sample_size = min(len(data), config.num_random_values)
    subsample: pd.DataFrame = data.sample(sample_size, random_state=42)

    # Numeric values are passed through `_prepare_value` together with the dtype of their column.
    for col in subsample.select_dtypes(include="number").columns:
        subsample[col] = subsample[col].apply(partial(_prepare_value, dtype=subsample[col].dtype))

    # Round-tripping through pandas' JSON writer turns dates into ISO strings and yields plain Python containers.
    subsample_json_string = subsample.to_json(orient="index", date_format="iso")
    stats["random_rows"] = json.loads(subsample_json_string)
    return stats


def _calculate_basic_stats(values: np.ndarray) -> dict[str, float]:
"""Randomly samples a small amount of points from the array (10% by default) to recalculate the statistics.
This introduces a 'positional instability' so that accidental mirroring or re-orderings are detected."""
Expand Down
Loading