From 86dc65f9ce3899bca1ba85ab796b1d0edc7ea697 Mon Sep 17 00:00:00 2001 From: grantbuster Date: Fri, 20 Dec 2024 13:59:25 -0700 Subject: [PATCH 01/13] automatically parse filepaths on hsds, s3, or local and use appropriate file handler including fsspec for zero setup s3 access --- rex/multi_file_resource.py | 19 ++++-- rex/renewable_resource.py | 42 +++++++++---- rex/resource.py | 124 ++++++++++++++++++++++++++++--------- 3 files changed, 139 insertions(+), 46 deletions(-) diff --git a/rex/multi_file_resource.py b/rex/multi_file_resource.py index ee91e00d..2727f1b5 100644 --- a/rex/multi_file_resource.py +++ b/rex/multi_file_resource.py @@ -4,7 +4,7 @@ """ import os from glob import glob -import h5py +import logging import numpy as np from rex.renewable_resource import (NSRDB, SolarResource, GeothermalResource, @@ -15,6 +15,9 @@ from rex.utilities.utilities import unstupify_path +logger = logging.getLogger(__name__) + + class MultiH5(BaseDatasetIterable): """ Class to handle multiple h5 file Resources @@ -59,8 +62,9 @@ def __getitem__(self, dset): h5 = self._h5_map[path] ds = h5[dset] else: - raise ValueError('{} is invalid must be one of: {}' - .format(dset, self.datasets)) + msg = f'{dset} is invalid must be one of: {self.datasets}' + logger.error(msg) + raise ValueError(msg) return ds @@ -126,7 +130,7 @@ def _get_dsets(h5_path): unique_dsets = [] shared_dsets = [] try: - with h5py.File(h5_path, mode='r') as f: + with Resource.open_file(h5_path, mode='r') as f: for dset in Resource._get_datasets(f): if dset not in ['meta', 'time_index', 'coordinates']: unique_dsets.append(dset) @@ -134,6 +138,7 @@ def _get_dsets(h5_path): shared_dsets.append(dset) except Exception as e: msg = ('Could not read file: "{}"'.format(h5_path)) + logger.error(msg) raise IOError(msg) from e return unique_dsets, shared_dsets @@ -183,7 +188,7 @@ def _map_file_instances(h5_files): """ h5_map = {} for f_path in h5_files: - h5_map[f_path] = h5py.File(f_path, mode='r') + h5_map[f_path] = Resource.open_file(f_path, mode='r') return h5_map @@ -218,6 +223,7 @@ def _preflight_check(self): if bad_files: msg = ("The following files' coordinates and time-index do not " "match:\n{}".format(bad_files)) + logger.error(msg) raise ResourceRuntimeError(msg) def close(self): @@ -277,6 +283,7 @@ def _get_h5_files(h5_path): msg = ('h5_path must be a unix shell style pattern with ' 'wildcard * in order to find files, but received ' 'directory specification: {}'.format(h5_path)) + logger.error(msg) raise FileInputError(msg) file_paths = glob(h5_path) @@ -284,6 +291,7 @@ def _get_h5_files(h5_path): if not any(file_paths): msg = ('Could not find any file paths with pattern: {}' .format(h5_path)) + logger.error(msg) raise FileInputError(msg) return h5_path, file_paths @@ -430,6 +438,7 @@ def _init_multi_h5(h5_source, check_files=False): else: msg = ('Cannot initialize MultiH5 from {}, expecting a path or a ' 'list of .h5 file paths'.format(type(h5_source))) + logger.error(msg) raise ResourceRuntimeError(msg) return multi_h5 diff --git a/rex/renewable_resource.py b/rex/renewable_resource.py index 0b088295..e592940c 100644 --- a/rex/renewable_resource.py +++ b/rex/renewable_resource.py @@ -113,7 +113,9 @@ def preload_SAM(cls, h5_file, sites, unscale=True, str_decode=True, Parameters ---------- h5_file : str - h5_file to extract resource from + String filepath to .h5 file to extract resource from. 
Can also + be a path to an HSDS file (starts with /nrel/) or S3 file + (starts with s3://) sites : list List of sites to be provided to SAM (sites is synonymous with gids aka spatial indices) @@ -126,7 +128,8 @@ def preload_SAM(cls, h5_file, sites, unscale=True, str_decode=True, Group within .h5 resource file to open hsds : bool, optional Boolean flag to use h5pyd to handle .h5 'files' hosted on AWS - behind HSDS, by default False + behind HSDS, by default False. This is now redundant; file paths + starting with /nrel/ will be treated as hsds=True by default hsds_kwargs : dict, optional Dictionary of optional kwargs for h5pyd, e.g., bucket, username, password, by default None @@ -173,7 +176,9 @@ def __init__(self, h5_file, mode='r', unscale=True, str_decode=True, Parameters ---------- h5_file : str - Path to .h5 resource file + String filepath to .h5 file to extract resource from. Can also + be a path to an HSDS file (starts with /nrel/) or S3 file + (starts with s3://) mode : str, optional Mode to instantiate h5py.File instance, by default 'r' unscale : bool @@ -193,7 +198,8 @@ def __init__(self, h5_file, mode='r', unscale=True, str_decode=True, option has no effect if data is available at multiple hub-heights. hsds : bool, optional Boolean flag to use h5pyd to handle .h5 'files' hosted on AWS - behind HSDS, by default False + behind HSDS, by default False. This is now redundant; file paths + starting with /nrel/ will be treated as hsds=True by default hsds_kwargs : dict, optional Dictionary of optional kwargs for h5pyd, e.g., bucket, username, password, by default None @@ -709,7 +715,9 @@ def preload_SAM(cls, h5_file, sites, unscale=True, str_decode=True, Parameters ---------- h5_file : str - h5_file to extract resource from + String filepath to .h5 file to extract resource from. Can also + be a path to an HSDS file (starts with /nrel/) or S3 file + (starts with s3://) sites : list List of sites to be provided to SAM (sites is synonymous with gids aka spatial indices) @@ -722,7 +730,8 @@ def preload_SAM(cls, h5_file, sites, unscale=True, str_decode=True, Group within .h5 resource file to open hsds : bool, optional Boolean flag to use h5pyd to handle .h5 'files' hosted on AWS - behind HSDS, by default False + behind HSDS, by default False. This is now redundant; file paths + starting with /nrel/ will be treated as hsds=True by default hsds_kwargs : dict, optional Dictionary of optional kwargs for h5pyd, e.g., bucket, username, password, by default None @@ -845,7 +854,9 @@ def preload_SAM(cls, h5_file, sites, unscale=True, str_decode=True, Parameters ---------- h5_file : str - h5_file to extract resource from + String filepath to .h5 file to extract resource from. Can also + be a path to an HSDS file (starts with /nrel/) or S3 file + (starts with s3://) sites : list List of sites to be provided to SAM (sites is synonymous with gids aka spatial indices) @@ -858,7 +869,8 @@ def preload_SAM(cls, h5_file, sites, unscale=True, str_decode=True, Group within .h5 resource file to open hsds : bool, optional Boolean flag to use h5pyd to handle .h5 'files' hosted on AWS - behind HSDS, by default False + behind HSDS, by default False. 
This is now redundant; file paths + starting with /nrel/ will be treated as hsds=True by default hsds_kwargs : dict, optional Dictionary of optional kwargs for h5pyd, e.g., bucket, username, password, by default None @@ -1456,7 +1468,9 @@ def preload_SAM(cls, h5_file, sites, hub_heights, unscale=True, Parameters ---------- h5_file : str - h5_file to extract resource from + String filepath to .h5 file to extract resource from. Can also + be a path to an HSDS file (starts with /nrel/) or S3 file + (starts with s3://) sites : list List of sites to be provided to SAM (sites is synonymous with gids aka spatial indices) @@ -1471,7 +1485,8 @@ def preload_SAM(cls, h5_file, sites, hub_heights, unscale=True, Group within .h5 resource file to open hsds : bool, optional Boolean flag to use h5pyd to handle .h5 'files' hosted on AWS - behind HSDS, by default False + behind HSDS, by default False. This is now redundant; file paths + starting with /nrel/ will be treated as hsds=True by default hsds_kwargs : dict, optional Dictionary of optional kwargs for h5pyd, e.g., bucket, username, password, by default None @@ -1601,7 +1616,9 @@ def preload_SAM(cls, h5_file, sites, depths, unscale=True, str_decode=True, Parameters ---------- h5_file : str - h5_file to extract resource from + String filepath to .h5 file to extract resource from. Can also + be a path to an HSDS file (starts with /nrel/) or S3 file + (starts with s3://) sites : list List of sites to be provided to SAM (sites is synonymous with gids aka spatial indices) @@ -1616,7 +1633,8 @@ def preload_SAM(cls, h5_file, sites, depths, unscale=True, str_decode=True, Group within .h5 resource file to open hsds : bool, optional Boolean flag to use h5pyd to handle .h5 'files' hosted on AWS - behind HSDS, by default False + behind HSDS, by default False. This is now redundant; file paths + starting with /nrel/ will be treated as hsds=True by default hsds_kwargs : dict, optional Dictionary of optional kwargs for h5pyd, e.g., bucket, username, password, by default None diff --git a/rex/resource.py b/rex/resource.py index 4875fca6..615292a7 100644 --- a/rex/resource.py +++ b/rex/resource.py @@ -4,6 +4,7 @@ """ import os from abc import ABC, abstractmethod +import logging from warnings import warn import dateutil @@ -17,6 +18,9 @@ from rex.utilities.utilities import check_tz, get_lat_lon_cols +logger = logging.getLogger(__name__) + + class BaseDatasetIterable(ABC): """Base class for file that is iterable over datasets. """ @@ -214,6 +218,7 @@ def _check_slice(ds_slice): msg = ('shape mismatch: indexing arrays could not be ' 'broadcast together with shapes {}' .format(['({},)'.format(ln) for ln in list_len])) + logger.error(msg) raise IndexError(msg) list_len = list_len[0] else: @@ -507,6 +512,7 @@ def _extract_ds_slice(self, ds_slice): 'limits, especially if you are using an NREL ' 'developer API key. For more details, see: ' 'https://nrel.github.io/rex/misc/examples.hsds.html') + logger.error(msg) raise ResourceRuntimeError(msg) from e # check to see if idx_slice needs to be applied @@ -610,7 +616,9 @@ def __init__(self, h5_file, mode='r', unscale=True, str_decode=True, Parameters ---------- h5_file : str - Path to .h5 resource file + String filepath to .h5 file to extract resource from. 
Can also + be a path to an HSDS file (starts with /nrel/) or S3 file + (starts with s3://) mode : str, optional Mode to instantiate h5py.File instance, by default 'r' unscale : bool, optional @@ -624,29 +632,21 @@ def __init__(self, h5_file, mode='r', unscale=True, str_decode=True, Group within .h5 resource file to open, by default None hsds : bool, optional Boolean flag to use h5pyd to handle .h5 'files' hosted on AWS - behind HSDS, by default False + behind HSDS, by default False. This is now redundant; file paths + starting with /nrel/ will be treated as hsds=True by default hsds_kwargs : dict, optional Dictionary of optional kwargs for h5pyd, e.g., bucket, username, password, by default None """ self.h5_file = h5_file - if hsds: - if mode != 'r': - raise OSError('Cannot write to files accessed vias HSDS!') - - import h5pyd - if hsds_kwargs is None: - hsds_kwargs = {} - self._h5 = h5pyd.File(self.h5_file, mode='r', use_cache=False, - **hsds_kwargs) - else: - try: - self._h5 = h5py.File(self.h5_file, mode=mode) - except Exception as e: - msg = ('Could not open file in mode "{}": "{}"' - .format(mode, self.h5_file)) - raise OSError(msg) from e + try: + self._h5 = self.open_file(h5_file, mode=mode, hsds=hsds, + hsds_kwargs=hsds_kwargs) + except Exception as e: + msg = f'Could not open file in mode "{mode}": "{h5_file}"' + logger.error(msg) + raise OSError(msg) from e self._group = group self._unscale = unscale @@ -695,6 +695,7 @@ def __getitem__(self, keys): out = self.get_SAM_df(site) # pylint: disable=E1111 else: msg = "Can only extract SAM DataFrame for a single site" + logger.error(msg) raise ResourceRuntimeError(msg) else: @@ -840,7 +841,9 @@ def meta(self): if 'meta' in self.h5: self._meta = self._get_meta('meta', slice(None)) else: - raise ResourceKeyError("'meta' is not a valid dataset") + msg = "'meta' is not a valid dataset" + logger.error(msg) + raise ResourceKeyError(msg) return self._meta @@ -858,7 +861,9 @@ def time_index(self): self._time_index = self._get_time_index('time_index', slice(None)) else: - raise ResourceKeyError("'time_index' is not a valid dataset!") + msg = "'time_index' is not a valid dataset!" + logger.error(msg) + raise ResourceKeyError(msg) return self._time_index @@ -1098,14 +1103,63 @@ def open_dataset(self, ds_name): Resource for open resource dataset """ if ds_name not in self.datasets: - raise ResourceKeyError('{} not in {}' - .format(ds_name, self.datasets)) + msg = '{} not in {}'.format(ds_name, self.datasets) + logger.error(msg) + raise ResourceKeyError(msg) ds = ResourceDataset(self.h5[ds_name], scale_attr=self.SCALE_ATTR, add_attr=self.ADD_ATTR, unscale=self._unscale) return ds + @staticmethod + def open_file(file_path, mode='r', hsds=False, hsds_kwargs=None): + """ + """ + + if file_path.startswith('/hsds/') or hsds: + if mode != 'r': + msg = 'Cannot write to files accessed vias HSDS!' + logger.error(msg) + raise OSError(msg) + + try: + import h5pyd + except Exception as e: + msg = (f'Tried to open hsds file path: "{file_path}" with ' + 'h5pyd but could not import, try `pip install h5pyd`') + logger.error(msg) + raise ImportError(msg) from e + + if hsds_kwargs is None: + hsds_kwargs = {} + + file = h5pyd.File(file_path, mode='r', use_cache=False, + **hsds_kwargs) + + if file_path.startswith('s3://'): + if mode != 'r': + msg = 'Cannot write to files accessed vias HSDS!' 
+ logger.error(msg) + raise OSError(msg) + try: + import fsspec + except Exception as e: + msg = (f'Tried to open s3 file path: "{file_path}" with ' + 'fsspec but could not import, try ' + '`pip install fsspec s3fs`') + logger.error(msg) + raise ImportError(msg) from e + + s3f = fsspec.open(file_path, mode='rb', anon=True, + default_fill_cache=False) + file = h5py.File(s3f.open(), mode=mode) + + else: + file = h5py.File(file_path, mode=mode) + + return file + def get_attrs(self, dset=None): """ Get h5 attributes either from file or dataset @@ -1209,7 +1263,9 @@ def get_meta_arr(self, rec_name, rows=slice(None)): if self._str_decode and np.issubdtype(meta_arr.dtype, np.bytes_): meta_arr = np.char.decode(meta_arr, encoding='utf-8') else: - raise ResourceKeyError("'meta' is not a valid dataset") + msg = "'meta' is not a valid dataset" + logger.error(msg) + raise ResourceKeyError(msg) return meta_arr @@ -1320,6 +1376,7 @@ def get_SAM_df(self, site): """ msg = ('Method to retrieve SAM dataframe not implemented for vanilla ' 'Resource handler. Use an NSRDB or WTK handler instead.') + logger.error(msg) raise NotImplementedError(msg) def _get_ds(self, ds_name, ds_slice): @@ -1341,8 +1398,9 @@ def _get_ds(self, ds_name, ds_slice): If unscale, returned in native units else in scaled units """ if ds_name not in self.datasets: - raise ResourceKeyError('{} not in {}' - .format(ds_name, self.datasets)) + msg = '{} not in {}'.format(ds_name, self.datasets) + logger.error(msg) + raise ResourceKeyError(msg) ds = self.h5[ds_name] ds_slice = parse_slice(ds_slice) @@ -1383,6 +1441,7 @@ def _get_ds_with_repeated_values(self, ds, ds_name, ds_slice): "the length of the meta and/or index, set the shape of " "{!r} to be 2-dimensional (current shape: {!r}), or use a " "1-dimensional slice.".format(ds_name, ds.shape)) + logger.error(msg) raise ResourceRuntimeError(msg) if ds.shape == ti_shape: @@ -1396,6 +1455,7 @@ def _get_ds_with_repeated_values(self, ds, ds_name, ds_slice): "update the length of ({0!r}) to match either the meta or " "index, or use a 1-dimensional slice." .format(ds_name, ds.shape, meta_shape, ti_shape)) + logger.error(msg) raise ResourceRuntimeError(msg) def _get_ds_with_spatial_repeat(self, ds, ds_name, ds_slice): @@ -1523,7 +1583,9 @@ def preload_SAM(cls, h5_file, sites, tech, unscale=True, str_decode=True, Parameters ---------- h5_file : str - h5_file to extract resource from + String filepath to .h5 file to extract resource from. Can also + be a path to an HSDS file (starts with /nrel/) or S3 file + (starts with s3://) sites : list List of sites to be provided to SAM (sites is synonymous with gids aka spatial indices) @@ -1538,7 +1600,8 @@ def preload_SAM(cls, h5_file, sites, tech, unscale=True, str_decode=True, Group within .h5 resource file to open hsds : bool, optional Boolean flag to use h5pyd to handle .h5 'files' hosted on AWS - behind HSDS, by default False + behind HSDS, by default False. This is now redundant; file paths + starting with /nrel/ will be treated as hsds=True by default hsds_kwargs : dict, optional Dictionary of optional kwargs for h5pyd, e.g., bucket, username, password, by default None @@ -1750,7 +1813,9 @@ def __init__(self, h5_file, unscale=True, str_decode=True, group=None, Parameters ---------- h5_file : str - Path to .h5 resource file + String filepath to .h5 file to extract resource from. 
Can also
+            be a path to an HSDS file (starts with /nrel/) or S3 file
+            (starts with s3://)
         unscale : bool, optional
             Boolean flag to automatically unscale variables on extraction,
             by default True
         str_decode : bool, optional
             Boolean flag to decode the bytestring meta data into normal
             strings. Setting this to False will speed up the meta data read,
             by default True
         group : str, optional
             Group within .h5 resource file to open, by default None
         hsds : bool, optional
             Boolean flag to use h5pyd to handle .h5 'files' hosted on AWS
-            behind HSDS, by default False
+            behind HSDS, by default False. This is now redundant; file paths
+            starting with /nrel/ will be treated as hsds=True by default
         hsds_kwargs : dict, optional
             Dictionary of optional kwargs for h5pyd, e.g., bucket, username,
             password, by default None

From c57d2d580da6bdb507267e7fbb33f79f6970e1ed Mon Sep 17 00:00:00 2001
From: grantbuster
Date: Fri, 20 Dec 2024 13:59:38 -0700
Subject: [PATCH 02/13] added s3/fsspec tests

---
 tests/s3_tests.py | 40 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)
 create mode 100644 tests/s3_tests.py

diff --git a/tests/s3_tests.py b/tests/s3_tests.py
new file mode 100644
index 00000000..5b7ca482
--- /dev/null
+++ b/tests/s3_tests.py
@@ -0,0 +1,40 @@
+# -*- coding: utf-8 -*-
+"""
+pytests for accessing NREL .h5 files on the cloud via s3/fsspec
+
+Note that this file cannot be named "test_*.py" because it is run with a
+separate github action that installs the s3 dependencies before running the
+test.
+"""
+import numpy as np
+from rex import NSRDB, WindResource
+
+
+def test_nsrdb():
+    """Test retrieving NSRDB data"""
+    with NSRDB("s3://nrel-pds-nsrdb/current/nsrdb_1998.h5") as res:
+        dsets = res.dsets
+        ghi = res['ghi', 0:10, 0]
+    assert isinstance(dsets, list)
+    assert isinstance(ghi, np.ndarray)
+
+
+def test_wtk():
+    """Test retrieving WTK data"""
+    fp = 's3://nrel-pds-wtk/conus/v1.0.0/wtk_conus_2007.h5'
+    with WindResource(fp) as res:
+        dsets = res.dsets
+        ws = res['windspeed_88m', 0:10, 0]
+    assert isinstance(dsets, list)
+    assert isinstance(ws, np.ndarray)
+
+
+def test_sup3rcc():
+    """Test retrieving sup3rcc data"""
+    fp = ('s3://nrel-pds-sup3rcc/conus_ecearth3_ssp585_r1i1p1f1/v0.1.0/'
+          'sup3rcc_conus_ecearth3_ssp585_r1i1p1f1_trh_2059.h5')
+    with WindResource(fp) as res:
+        dsets = res.dsets
+        temp = res['temperature_2m', 0:10, 0]
+    assert isinstance(dsets, list)
+    assert isinstance(temp, np.ndarray)

From b68a55c60eb7c4df71c26b0c4c631f0810c736ce Mon Sep 17 00:00:00 2001
From: grantbuster
Date: Fri, 20 Dec 2024 13:59:54 -0700
Subject: [PATCH 03/13] added github action test for s3 files

---
 .github/workflows/s3_tests.yml | 38 ++++++++++++++++++++++++++++++++++
 setup.py                       |  3 ++-
 2 files changed, 40 insertions(+), 1 deletion(-)
 create mode 100644 .github/workflows/s3_tests.yml

diff --git a/.github/workflows/s3_tests.yml b/.github/workflows/s3_tests.yml
new file mode 100644
index 00000000..a93674da
--- /dev/null
+++ b/.github/workflows/s3_tests.yml
@@ -0,0 +1,38 @@
+name: h5pyd tests
+
+on: pull_request
+
+jobs:
+  build:
+    runs-on: ${{ matrix.os }}
+    strategy:
+      matrix:
+        os: [ubuntu-latest, macos-latest, windows-latest]
+        python-version: ["3.10"]
+        include:
+          - os: ubuntu-latest
+            python-version: 3.9
+          - os: ubuntu-latest
+            python-version: 3.8
+
+    steps:
+    - name: Checkout code
+      uses: actions/checkout@v4
+
+    - name: Set up Python ${{ matrix.python-version }}
+      uses: actions/setup-python@v5
+      with:
+        python-version: ${{ matrix.python-version }}
+
+    - name: Install dependencies
+      shell: bash
+      run: |
+        python -m pip install --upgrade pip
+        pip install pytest
+        pip install pytest-cov
+        
pip install -e .[s3] + + - name: Run tests + shell: bash + run: | + pytest -v tests/s3_tests.py diff --git a/setup.py b/setup.py index 6a4a0583..b69b6deb 100644 --- a/setup.py +++ b/setup.py @@ -89,7 +89,8 @@ def run(self): extras_require={ "test": test_requires, "dev": test_requires + dev_requires, - "hsds": ["hsds>=0.8.4"] + "hsds": ["hsds>=0.8.4"], + "s3": ['fsspec', 's3fs'], }, cmdclass={"develop": PostDevelopCommand}, ) From b68a55c60eb7c4df71c26b0c4c631f0810c736ce Mon Sep 17 00:00:00 2001 From: grantbuster Date: Fri, 20 Dec 2024 14:00:06 -0700 Subject: [PATCH 04/13] updated examples to reference fsspec as easiest with zero setup --- examples/NREL_Data/README.rst | 54 +++++++++++++++++++++++------------ examples/fsspec/README.rst | 15 ++++------ 2 files changed, 41 insertions(+), 28 deletions(-) diff --git a/examples/NREL_Data/README.rst b/examples/NREL_Data/README.rst index fb766b16..c454aeb3 100644 --- a/examples/NREL_Data/README.rst +++ b/examples/NREL_Data/README.rst @@ -26,7 +26,7 @@ Definitions - ``h5pyd`` - The python library that provides the HDF REST interface to NREL data hosted on the cloud. This allows for the public to access small parts of large cloud-hosted datasets. See the `h5pyd `_ library for more details. - ``hsds`` - The highly scalable data service (HSDS) that we recommend to access small chunks of very large cloud-hosted NREL datasets. See the `hsds `_ library for more details. - ``meta`` - The ``dataset`` in an NREL h5 file that contains information about the spatial axis. This is typically a `pandas DataFrame `_ with columns such as "latitude", "longitude", "state", etc... The DataFrame is typically converted to a records array for storage in an h5 ``dataset``. The length of the meta data should match the length of axis 1 of a 2D spatiotemporal ``dataset``. - - ``S3`` - Amazon Simple Storage Service (S3) is a basic cloud file storage system we use to store raw .h5 files in their full volume. Downloading files directly from S3 may not be the easiest way to access the data because each file tends to be multiple terabytes. Instead, you can stream small chunks of the files via HSDS. + - ``S3`` - Amazon Simple Storage Service (S3) is a basic cloud file storage system we use to store raw .h5 files in their full volume. Downloading files directly from S3 may not be the easiest way to access the data because each file tends to be multiple terabytes. Instead, you can stream small chunks of the files via HSDS. - ``scale_factor`` - We frequently scale data by a multiplicative factor, round the data to integer precision, and store the data in integer arrays. The ``scale_factor`` is an attribute associated with the relevant h5 ``dataset`` that defines the multiplicative factor required to unscale the data from integer storage to the original physical units. - ``time_index`` - The ``dataset`` in an NREL h5 file that contains information about the temporal axis. This is typically a `pandas DatetimeIndex `_ that has been converted to a string array for storage in an h5 ``dataset``. The length of this ``dataset`` should match the length of axis 0 of a 2D spatiotemporal ``dataset``. @@ -56,20 +56,30 @@ This datasets directory should not be confused with a ``dataset`` from an h5 file. When using the ``rex`` examples below, update the file paths with the relevant -NREL HPC file paths in ``/datasets/`` and set ``hsds=False``. +NREL HPC file paths in ``/datasets/``. 
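+
+For example, a minimal sketch of reading data from an NREL HPC file path with
+``rex`` (the exact filename below is illustrative; substitute any .h5 file
+under ``/datasets/``):
+
+.. code-block:: python
+
+    from rex import Resource
+
+    # local read on the NREL HPC; no special flags or setup needed
+    with Resource('/datasets/NSRDB/current/nsrdb_2020.h5') as res:
+        ghi = res['ghi', :, 500]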
Data Location - External Users ------------------------------ -If you are not at NREL, the easiest way to access this data is via HSDS. These -files are massive and downloading the full files would crash your computer. -HSDS provides a solution to stream small chunks of the data to your laptop or -server for just the time or space domain you're interested in. +If you are not at NREL, you can't just download these files. They are massive +and downloading the full files would crash your computer. the easiest way to +access this data is probably with ``fsspec``, which allows you to access files +directly on S3 with only one additional installation and no server setup. +However, this method is slow. The most performant method is via ``HSDS``. +``HSDS`` provides a solution to stream small chunks of the data to your laptop +or server for just the time or space domain you're interested in. + +See `this docs page `_ +for easy (but slow) access of the source .h5 files on s3 with ``fsspec``. See `this docs page `_ for instructions on how to set up HSDS and then continue on to the Data Access Examples section below. +There is also an experiment with using `zarr +`_, but the examples below +may not work with these utilities and the zarr example is not regularly tested. + To find relevant HSDS files, you can use HSDS and h5pyd to explore the NREL public data directory listings. For example, if you are running an HSDS local server, you can use the CLI utility ``hsls``, for example, run: ``$ hsls @@ -77,23 +87,29 @@ server, you can use the CLI utility ``hsls``, for example, run: ``$ hsls thing. In a python kernel, ``import h5pyd`` and then run: ``print(list(h5pyd.Folder('/nrel/')))`` to list the ``/nrel/`` directory. -The `Open Energy Data Initiative (OEDI) `_ -is also invaluable in finding energy-relevant public datasets that are not -necessarily spatiotemporal h5 data. +Note that raw NREL .h5 data files are hosted on AWS S3. In contrast, the files +on HSDS are not real "files". They are just domains that you can access with +h5pyd or rex tools to stream small chunks of the files stored on S3. The +multi-terabyte .h5 files on S3 would be incredibly cumbersome to access +otherwise. -Note that raw NREL .h5 data files are hosted on AWS S3. In contrast, the files on HSDS are not real "files". They are just domains that you can access with h5pyd or rex tools to stream small chunks of the files stored on S3. The multi-terabyte .h5 files on S3 would be incredibly cumbersome to access otherwise. +The `Open Energy Data Initiative (OEDI) `_ +is also invaluable for finding the source s3 filepaths and for finding +energy-relevant public datasets that are not necessarily spatiotemporal h5 +data. -We have also experimented with external data access using `fsspec `_ and `zarr `_, but the examples below may not work with these utilities. Data Access Examples -------------------- -If you are on the NREL HPC, update the file paths in the examples below and set -``hsds=False``. +If you are on the NREL HPC, update the file paths with the relevant NREL HPC +file paths in ``/datasets/``. + If you are not at NREL, see the "Data Location - External Users" section above -for how to setup HSDS and how to find the files that you're interested in. Then -update the file paths to the files you want and keep ``hsds=True``. +for S3 instructions or for how to setup HSDS and how to find the files that +you're interested in. Then update the file paths to the files you want either +on HSDS or S3. 
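+
+As a quick illustration of the path handling (a sketch; swap in whichever
+file you actually want), the same handler accepts a local file, an S3 path,
+or an HSDS path and picks the right backend from the path itself:
+
+.. code-block:: python
+
+    from rex import Resource
+
+    fp_hpc = '/datasets/NSRDB/current/nsrdb_2020.h5'     # local NREL HPC file
+    fp_s3 = 's3://nrel-pds-nsrdb/current/nsrdb_2020.h5'  # S3, zero setup
+    fp_hsds = '/nrel/nsrdb/current/nsrdb_2020.h5'        # HSDS, needs setup
+
+    with Resource(fp_s3) as res:
+        meta = res.meta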
The rex Resource Class ++++++++++++++++++++++ @@ -105,7 +121,7 @@ retrieve ``time_index`` and ``meta`` datasets in their native pandas datatypes. .. code-block:: python from rex import Resource - with Resource('/nrel/nsrdb/current/nsrdb_2020.h5', hsds=True) as res: + with Resource('/nrel/nsrdb/current/nsrdb_2020.h5') as res: ghi = res['ghi', :, 500] print(res.dsets) print(res.attrs['ghi']) @@ -131,7 +147,7 @@ windspeed is not available as a ``dataset``: .. code-block:: python from rex import WindResource - with WindResource('/nrel/wtk/conus/wtk_conus_2007.h5', hsds=True) as res: + with WindResource('/nrel/wtk/conus/wtk_conus_2007.h5') as res: ws88 = res['windspeed_88m', :, 1000] print(res.dsets) print(ws88) @@ -150,7 +166,7 @@ for a requested coordinate: .. code-block:: python from rex import ResourceX - with ResourceX('/nrel/wtk/conus/wtk_conus_2007.h5', hsds=True) as res: + with ResourceX('/nrel/wtk/conus/wtk_conus_2007.h5') as res: df = res.get_lat_lon_df('temperature_2m', (39.7407, -105.1686)) print(df) @@ -170,7 +186,7 @@ the System Advisor Model (SAM). For example, try: .. code-block:: python from rex import SolarX - with SolarX('/nrel/nsrdb/current/nsrdb_2020.h5', hsds=True) as res: + with SolarX('/nrel/nsrdb/current/nsrdb_2020.h5') as res: df = res.get_SAM_lat_lon((39.7407, -105.1686)) print(df) diff --git a/examples/fsspec/README.rst b/examples/fsspec/README.rst index a36d2908..319ae160 100644 --- a/examples/fsspec/README.rst +++ b/examples/fsspec/README.rst @@ -1,37 +1,34 @@ fsspec ====== -You can use ``fsspec`` to open NREL h5 resource files hosted on AWS S3 on your local computer. In our internal tests, this is slower than the `HSDS `_ and `zarr `_ examples, but is much easier to set up. This may be a good option for people outside of NREL trying to access small to medium amounts of NREL .h5 data in applications that are not sensitive to IO performance. +Filesystem utilities from ``fsspec`` enable users outside of NREL to open h5 resource files hosted on AWS S3 on your local computer. In our internal tests, this is slower than the `HSDS `_ and `zarr `_ examples, but as of ``rex`` version v0.2.92 it requires zero setup beyond installing ``rex`` and ``fsspec`` as described below. This may be a good option for people outside of NREL trying to access small to medium amounts of NREL .h5 data in applications that are not sensitive to IO performance. For more info on ``fsspec``, read the docs `here `_ Extra Requirements ------------------ -You may need some additional software beyond the rex requirements to run this example: +You may need some additional software beyond the basic ``rex`` install to run this example: .. code-block:: bash - pip install fsspec + pip install fsspec s3fs Code Example ------------ -To open an .h5 file hosted on AWS S3, follow the code example below. Here are some caveats to this approach: +To open an .h5 file hosted on AWS S3, simply use a path to an S3 resource.: -- Change ``fp`` to your desired AWS .h5 resource paths. +- Change ``fp`` to your desired AWS .h5 resource paths (find the s3 paths on `Open Energy Data Initiative (OEDI) `_). - Running this example on a laptop, it takes ~14 seconds to read the meta data, and another ~14 seconds to read the GHI timeseries. This may be faster when running on AWS services in the same region hosting the .h5 file. It is much slower when running on the NREL VPN. 
-- The ``s3f`` object works like a local .h5 filepath and can be passed to any of the ``rex`` resource handlers, which will handle all of the data scaling and formatting. .. code-block:: python import time - import fsspec from rex import Resource fp = "s3://nrel-pds-nsrdb/current/nsrdb_1998.h5" - s3f = fsspec.open(fp, mode='rb', anon=True, default_fill_cache=False) - res = Resource(s3f.open()) + res = Resource(fp) t0 = time.time() meta = res.meta From 6404202723170646e2b489bed99d44bf8dabec02 Mon Sep 17 00:00:00 2001 From: grantbuster Date: Fri, 20 Dec 2024 14:00:35 -0700 Subject: [PATCH 05/13] bump rex version for easier s3 access --- rex/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rex/version.py b/rex/version.py index 3d0eb7c4..f6f2c9b4 100644 --- a/rex/version.py +++ b/rex/version.py @@ -1,3 +1,3 @@ """rex Version number""" -__version__ = "0.2.91" +__version__ = "0.2.92" From 636e9e3bb01a604959be2fcd4f74c8733aced570 Mon Sep 17 00:00:00 2001 From: grantbuster Date: Fri, 20 Dec 2024 14:05:01 -0700 Subject: [PATCH 06/13] docstrings --- rex/resource.py | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/rex/resource.py b/rex/resource.py index 615292a7..35b9437f 100644 --- a/rex/resource.py +++ b/rex/resource.py @@ -1114,7 +1114,30 @@ def open_dataset(self, ds_name): @staticmethod def open_file(file_path, mode='r', hsds=False, hsds_kwargs=None): - """ + """Open a filepath to an h5, s3, or hsds nrel resource file with the + appropriate python object. + + Parameters + ---------- + file_path : str + String filepath to .h5 file to extract resource from. Can also + be a path to an HSDS file (starts with /nrel/) or S3 file + (starts with s3://) + mode : str, optional + Mode to instantiate h5py.File instance, by default 'r' + hsds : bool, optional + Boolean flag to use h5pyd to handle .h5 'files' hosted on AWS + behind HSDS, by default False. This is now redundant; file paths + starting with /nrel/ will be treated as hsds=True by default + hsds_kwargs : dict, optional + Dictionary of optional kwargs for h5pyd, e.g., bucket, username, + password, by default None + + Returns + ------- + file : h5py.File | h5pyd.File + H5 file handler either opening the local file using h5py, or the + file on s3 using h5py and fsspec, or the file on HSDS using h5pyd. """ if file_path.startswith('/hsds/') or hsds: From f025b63a9995ec6a58ba37f006424e6751f0f165 Mon Sep 17 00:00:00 2001 From: grantbuster Date: Fri, 20 Dec 2024 14:05:41 -0700 Subject: [PATCH 07/13] update name of s3 fsspec tests --- .github/workflows/s3_tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/s3_tests.yml b/.github/workflows/s3_tests.yml index a93674da..1d50c547 100644 --- a/.github/workflows/s3_tests.yml +++ b/.github/workflows/s3_tests.yml @@ -1,4 +1,4 @@ -name: h5pyd tests +name: s3 fsspec tests on: pull_request From bb83a9ec1729a493fe8bca33e89d3fc46401b08b Mon Sep 17 00:00:00 2001 From: grantbuster Date: Fri, 20 Dec 2024 14:21:52 -0700 Subject: [PATCH 08/13] bug fixes on my terrible code --- rex/resource.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/rex/resource.py b/rex/resource.py index 35b9437f..bb8fdae5 100644 --- a/rex/resource.py +++ b/rex/resource.py @@ -1140,9 +1140,9 @@ def open_file(file_path, mode='r', hsds=False, hsds_kwargs=None): file on s3 using h5py and fsspec, or the file on HSDS using h5pyd. 
""" - if file_path.startswith('/hsds/') or hsds: + if file_path.startswith('/nrel/') or hsds: if mode != 'r': - msg = 'Cannot write to files accessed vias HSDS!' + msg = 'Cannot write to files accessed via HSDS!' logger.error(msg) raise OSError(msg) @@ -1160,11 +1160,12 @@ def open_file(file_path, mode='r', hsds=False, hsds_kwargs=None): file = h5pyd.File(file_path, mode='r', use_cache=False, **hsds_kwargs) - if file_path.startswith('s3://'): + elif file_path.startswith('s3://'): if mode != 'r': - msg = 'Cannot write to files accessed vias HSDS!' + msg = 'Cannot write to files accessed via s3/fsspec!' logger.error(msg) raise OSError(msg) + try: import fsspec except Exception as e: From 984befdb22cc342c363fd6e4c4690c30f7b16f86 Mon Sep 17 00:00:00 2001 From: grantbuster Date: Fri, 20 Dec 2024 14:59:04 -0700 Subject: [PATCH 09/13] enable multi year resource handler to work with s3 files without any fancy stuff --- rex/multi_time_resource.py | 65 ++++++++++++++++++++++++++++++++++++-- rex/multi_year_resource.py | 36 ++++++++++++++------- rex/resource.py | 44 +++++++++++++++++++++++--- tests/s3_tests.py | 13 +++++++- 4 files changed, 139 insertions(+), 19 deletions(-) diff --git a/rex/multi_time_resource.py b/rex/multi_time_resource.py index f19023ad..4ac6558c 100644 --- a/rex/multi_time_resource.py +++ b/rex/multi_time_resource.py @@ -6,6 +6,7 @@ from glob import glob from itertools import chain from fnmatch import fnmatch +import logging import numpy as np import pandas as pd @@ -21,6 +22,9 @@ from rex.utilities.parse_keys import parse_keys, parse_slice +logger = logging.getLogger(__name__) + + class MultiTimeH5: """ Class to handle h5 Resources stored over multiple temporal files @@ -224,11 +228,23 @@ def _get_hsds_file_paths(h5_path, hsds_kwargs=None): file_paths : list List of filepaths for this handler to handle. """ - import h5pyd + try: + import h5pyd + except Exception as e: + msg = (f'Tried to open hsds file path: "{h5_path}" with ' + 'h5pyd but could not import, try `pip install h5pyd`') + logger.error(msg) + raise ImportError(msg) from e if hsds_kwargs is None: hsds_kwargs = {} + if isinstance(h5_path, (list, tuple)): + msg = ('HSDS filepath must be a string, possibly with glob ' + 'pattern, but received list/tuple') + logger.error(msg) + raise TypeError(msg) + hsds_dir = os.path.dirname(h5_path) fn = os.path.basename(h5_path) @@ -237,12 +253,14 @@ def _get_hsds_file_paths(h5_path, hsds_kwargs=None): 'directory name! The directory must be explicit but the ' 'filename can have wildcards. This HSDS h5_path input ' 'cannot be used: {}'.format(h5_path)) + logger.error(msg) raise FileNotFoundError(msg) if not fn: msg = ('h5_path must be a unix shell style pattern with ' 'wildcard * in order to find files, but received ' 'directory specification: {}'.format(h5_path)) + logger.error(msg) raise FileInputError(msg) with h5pyd.Folder(hsds_dir + '/', **hsds_kwargs) as f: @@ -251,6 +269,47 @@ def _get_hsds_file_paths(h5_path, hsds_kwargs=None): return file_paths + @staticmethod + def _get_s3_file_paths(h5_path): + """ + Get a list of h5 filepaths matching the h5_path specification from s3 + + Parameters + ---------- + h5_path : str + Unix shell style pattern path with * wildcards to multi-file + resource file sets. Files must have the same coordinates + but can have different datasets or time indexes. + + Returns + ------- + file_paths : list + List of filepaths for this handler to handle. 
+ """ + try: + import s3fs + except Exception as e: + msg = (f'Tried to open s3 file path: "{h5_path}" with ' + 'fsspec but could not import, try ' + '`pip install fsspec s3fs`') + logger.error(msg) + raise ImportError(msg) from e + + s3 = s3fs.S3FileSystem(anon=True) + + if isinstance(h5_path, (list, tuple)): + file_paths = [s3.glob(fp) for fp in h5_path] + file_paths = list(chain.from_iterable(file_paths)) + elif isinstance(h5_path, str): + file_paths = s3.glob(h5_path) + + # s3fs glob drops prefix for some reason + for i, fp in enumerate(file_paths): + if not fp.startswith('s3://'): + file_paths[i] = f's3://{fp}' + + return file_paths + @classmethod def _get_file_paths(cls, h5_path, hsds=False, hsds_kwargs=None): """ @@ -277,9 +336,11 @@ def _get_file_paths(cls, h5_path, hsds=False, hsds_kwargs=None): List of filepaths for this handler to handle. """ - if hsds: + if Resource.is_hsds_file(h5_path) or hsds: file_paths = cls._get_hsds_file_paths(h5_path, hsds_kwargs=hsds_kwargs) + elif Resource.is_s3_file(h5_path): + file_paths = cls._get_s3_file_paths(h5_path) elif isinstance(h5_path, (list, tuple)): file_paths = list(chain.from_iterable(glob(fp) for fp in h5_path)) for fp in file_paths: diff --git a/rex/multi_year_resource.py b/rex/multi_year_resource.py index 40c2fdac..585722a3 100644 --- a/rex/multi_year_resource.py +++ b/rex/multi_year_resource.py @@ -31,14 +31,16 @@ def __init__(self, h5_path, years=None, res_cls=Resource, hsds=False, h5_path : str Unix shell style pattern path with * wildcards to multi-file resource file sets. Files must have the same coordinates - but can have different datasets or time indexes. + but can have different datasets or time indexes. Can also be a path + on HSDS starting with /nrel/ or a path on s3 starting with s3:// years : list, optional List of integer years to access, by default None res_cls : obj Resource class to use to open and access resource data hsds : bool Boolean flag to use h5pyd to handle .h5 'files' hosted on AWS - behind HSDS + behind HSDS, by default False. This is now redundant; file paths + starting with /nrel/ will be treated as hsds=True by default hsds_kwargs : dict, optional Dictionary of optional kwargs for h5pyd, e.g., bucket, username, password, by default None @@ -416,7 +418,8 @@ def __init__(self, h5_path, years=None, unscale=True, str_decode=True, resource file sets. Files must have the same coordinates but can have different datasets or time indexes. Can also be an explicit list of multi time files, which themselves can - contain * wildcards. + contain * wildcards. Can also be a path on HSDS starting with + /nrel/ or a path on s3 starting with s3:// years : list, optional List of years to access, by default None unscale : bool @@ -428,7 +431,8 @@ def __init__(self, h5_path, years=None, unscale=True, str_decode=True, Resource handler to us to open individual .h5 files hsds : bool, optional Boolean flag to use h5pyd to handle .h5 'files' hosted on AWS - behind HSDS, by default False + behind HSDS, by default False. This is now redundant; file paths + starting with /nrel/ will be treated as hsds=True by default hsds_kwargs : dict, optional Dictionary of optional kwargs for h5pyd, e.g., bucket, username, password, by default None @@ -469,7 +473,8 @@ def __init__(self, h5_path, years=None, unscale=True, str_decode=True, h5_path : str Unix shell style pattern path with * wildcards to multi-file resource file sets. Files must have the same coordinates - but can have different datasets or time indexes. 
+ but can have different datasets or time indexes. Can also be a path + on HSDS starting with /nrel/ or a path on s3 starting with s3:// years : list, optional List of years to access, by default None unscale : bool @@ -479,7 +484,8 @@ def __init__(self, h5_path, years=None, unscale=True, str_decode=True, strings. Setting this to False will speed up the meta data read. hsds : bool, optional Boolean flag to use h5pyd to handle .h5 'files' hosted on AWS - behind HSDS, by default False + behind HSDS, by default False. This is now redundant; file paths + starting with /nrel/ will be treated as hsds=True by default hsds_kwargs : dict, optional Dictionary of optional kwargs for h5pyd, e.g., bucket, username, password, by default None @@ -505,7 +511,8 @@ def __init__(self, h5_path, years=None, unscale=True, str_decode=True, resource file sets. Files must have the same coordinates but can have different datasets or time indexes. Can also be an explicit list of multi time files, which themselves can - contain * wildcards. + contain * wildcards. Can also be a path on HSDS starting with + /nrel/ or a path on s3 starting with s3:// years : list, optional List of years to access, by default None unscale : bool @@ -515,7 +522,8 @@ def __init__(self, h5_path, years=None, unscale=True, str_decode=True, strings. Setting this to False will speed up the meta data read. hsds : bool, optional Boolean flag to use h5pyd to handle .h5 'files' hosted on AWS - behind HSDS, by default False + behind HSDS, by default False. This is now redundant; file paths + starting with /nrel/ will be treated as hsds=True by default hsds_kwargs : dict, optional Dictionary of optional kwargs for h5pyd, e.g., bucket, username, password, by default None @@ -541,7 +549,8 @@ def __init__(self, h5_path, years=None, unscale=True, str_decode=True, resource file sets. Files must have the same coordinates but can have different datasets or time indexes. Can also be an explicit list of multi time files, which themselves can - contain * wildcards. + contain * wildcards. Can also be a path on HSDS starting with + /nrel/ or a path on s3 starting with s3:// years : list, optional List of years to access, by default None unscale : bool @@ -551,7 +560,8 @@ def __init__(self, h5_path, years=None, unscale=True, str_decode=True, strings. Setting this to False will speed up the meta data read. hsds : bool, optional Boolean flag to use h5pyd to handle .h5 'files' hosted on AWS - behind HSDS, by default False + behind HSDS, by default False. This is now redundant; file paths + starting with /nrel/ will be treated as hsds=True by default hsds_kwargs : dict, optional Dictionary of optional kwargs for h5pyd, e.g., bucket, username, password, by default None @@ -577,7 +587,8 @@ def __init__(self, h5_path, years=None, unscale=True, str_decode=True, resource file sets. Files must have the same coordinates but can have different datasets or time indexes. Can also be an explicit list of multi time files, which themselves can - contain * wildcards. + contain * wildcards. Can also be a path on HSDS starting with + /nrel/ or a path on s3 starting with s3:// years : list, optional List of years to access, by default None unscale : bool @@ -587,7 +598,8 @@ def __init__(self, h5_path, years=None, unscale=True, str_decode=True, strings. Setting this to False will speed up the meta data read. hsds : bool, optional Boolean flag to use h5pyd to handle .h5 'files' hosted on AWS - behind HSDS, by default False + behind HSDS, by default False. 
This is now redundant; file paths + starting with /nrel/ will be treated as hsds=True by default hsds_kwargs : dict, optional Dictionary of optional kwargs for h5pyd, e.g., bucket, username, password, by default None diff --git a/rex/resource.py b/rex/resource.py index bb8fdae5..211cca8d 100644 --- a/rex/resource.py +++ b/rex/resource.py @@ -1112,8 +1112,8 @@ def open_dataset(self, ds_name): return ds - @staticmethod - def open_file(file_path, mode='r', hsds=False, hsds_kwargs=None): + @classmethod + def open_file(cls, file_path, mode='r', hsds=False, hsds_kwargs=None): """Open a filepath to an h5, s3, or hsds nrel resource file with the appropriate python object. @@ -1140,7 +1140,7 @@ def open_file(file_path, mode='r', hsds=False, hsds_kwargs=None): file on s3 using h5py and fsspec, or the file on HSDS using h5pyd. """ - if file_path.startswith('/nrel/') or hsds: + if cls.is_hsds_file(file_path) or hsds: if mode != 'r': msg = 'Cannot write to files accessed via HSDS!' logger.error(msg) @@ -1160,7 +1160,7 @@ def open_file(file_path, mode='r', hsds=False, hsds_kwargs=None): file = h5pyd.File(file_path, mode='r', use_cache=False, **hsds_kwargs) - elif file_path.startswith('s3://'): + elif cls.is_s3_file(file_path): if mode != 'r': msg = 'Cannot write to files accessed via s3/fsspec!' logger.error(msg) @@ -1184,6 +1184,42 @@ def open_file(file_path, mode='r', hsds=False, hsds_kwargs=None): return file + @staticmethod + def is_hsds_file(file_path): + """Parse one or more filepath to determine if it is hsds + + Parameters + ---------- + file_path : str | list + One or more file paths (only the first is parsed if multiple) + + Returns + ------- + is_hsds_file : bool + True if hsds + """ + if isinstance(file_path, (list, tuple)): + file_path = file_path[0] + return file_path.startswith('/nrel/') + + @staticmethod + def is_s3_file(file_path): + """Parse one or more filepath to determine if it is s3 + + Parameters + ---------- + file_path : str | list + One or more file paths (only the first is parsed if multiple) + + Returns + ------- + is_s3_file : bool + True if s3 + """ + if isinstance(file_path, (list, tuple)): + file_path = file_path[0] + return file_path.startswith('s3://') + def get_attrs(self, dset=None): """ Get h5 attributes either from file or dataset diff --git a/tests/s3_tests.py b/tests/s3_tests.py index 5b7ca482..650aa1f6 100644 --- a/tests/s3_tests.py +++ b/tests/s3_tests.py @@ -7,7 +7,7 @@ test. 
""" import numpy as np -from rex import NSRDB, WindResource +from rex import NSRDB, WindResource, MultiYearResource def test_nsrdb(): @@ -38,3 +38,14 @@ def test_sup3rcc(): temp = res['temperature_2m', 0:10, 0] assert isinstance(dsets, list) assert isinstance(temp, np.ndarray) + + +def test_multiyear(): + """Test retrieving multi year NSRDB data""" + files = ["s3://nrel-pds-nsrdb/current/nsrdb_199*.h5"] + with MultiYearResource(files) as res: + dsets = res.dsets + ghi = res['ghi', 0:10, 0] + assert res.shape[0] == 35040 # 2x years at 30min (1998 and 1999) + assert isinstance(dsets, list) + assert isinstance(ghi, np.ndarray) From 1c116538e3c63c7273fb73196073022fb4e4f259 Mon Sep 17 00:00:00 2001 From: grantbuster Date: Fri, 20 Dec 2024 15:30:55 -0700 Subject: [PATCH 10/13] handle posixpath object parsing for hsds and s3 filepaths --- rex/resource.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rex/resource.py b/rex/resource.py index 211cca8d..f570c931 100644 --- a/rex/resource.py +++ b/rex/resource.py @@ -1200,7 +1200,7 @@ def is_hsds_file(file_path): """ if isinstance(file_path, (list, tuple)): file_path = file_path[0] - return file_path.startswith('/nrel/') + return str(file_path).startswith('/nrel/') @staticmethod def is_s3_file(file_path): @@ -1218,7 +1218,7 @@ def is_s3_file(file_path): """ if isinstance(file_path, (list, tuple)): file_path = file_path[0] - return file_path.startswith('s3://') + return str(file_path).startswith('s3://') def get_attrs(self, dset=None): """ From e01980980408b682d5f1d61d66e53bbb8fa4a213 Mon Sep 17 00:00:00 2001 From: grantbuster Date: Mon, 23 Dec 2024 10:17:22 -0700 Subject: [PATCH 11/13] updated documentation throughout --- README.rst | 76 +++++++++++++++---------------- docs/source/index.rst | 10 ---- docs/source/misc/installation.rst | 7 --- examples/HSDS/README.rst | 21 ++++++++- examples/NREL_Data/README.rst | 28 +++++------- examples/fsspec/README.rst | 6 +-- rex/multi_time_resource.py | 5 +- rex/resource.py | 5 +- 8 files changed, 76 insertions(+), 82 deletions(-) diff --git a/README.rst b/README.rst index 06cd5c93..52df3530 100644 --- a/README.rst +++ b/README.rst @@ -33,34 +33,25 @@ Welcome to The REsource eXtraction (rex) tool! .. inclusion-intro -rex command line tools -====================== - -- `rex `_ -- `NSRDBX `_ -- `WINDX `_ -- `US-wave `_ -- `WaveX `_ -- `MultiYearX `_ -- `rechunk `_ -- `temporal-stats `_ -- `wind-rose `_ - -Using Eagle Env -=============== - -If you would like to run `rex` on Eagle (NREL's HPC) you can use a pre-compiled -conda env: - -.. code-block:: bash - - conda activate /shared-projects/rev/modulefiles/conda/envs/rev/ - -or - -.. code-block:: bash - - source activate /shared-projects/rev/modulefiles/conda/envs/rev/ +What is rex? +============= +``rex`` stands for **REsource eXtraciton** tool. + +``rex`` enables the efficient and scalable extraction, manipulation, and +computation with NRELs flagship renewable resource datasets such as: the Wind +Integration National Dataset (WIND Toolkit), the National Solar Radiation +Database (NSRDB), the Ocean Surface Wave Hindcast (US Wave) Data, and the +High-resolution downscaled climate change data (Sup3rCC). + +To get started accessing NREL's datasets, see the primer on `NREL Renewable +Energy Resource Data +`_ or the +`installation instructions `_. + +You might also want to check out the basic `Resource Class +`_ that +can be used to efficiently query NREL data, or our various `example use cases +`_. 
Installing rex ============== @@ -78,12 +69,13 @@ Option 1: Install from PIP or Conda (recommended for analysts): 2. Activate directory: ``conda activate rex`` -3. Install rex: - 1) ``pip install NREL-rex`` or - 2) ``conda install nrel-rex --channel=nrel`` +3. Basic ``rex`` install: + 1) ``pip install NREL-rex`` + 2) or ``conda install nrel-rex --channel=nrel`` - - NOTE: If you install using conda and want to use `HSDS `_ - you will also need to install h5pyd manually: ``pip install h5pyd`` +4. Install for users outside of NREL that want to access data via HSDS or S3 as per the instructions `here `_: + 1) ``pip install NREL-rex[s3]`` for easy no-setup direct access of the data on S3 via ``fsspec`` as per `this example `_ + 2) or ``pip install NREL-rex[hsds]`` for more performant access of the data on HSDS with slightly more setup as per `this example `_ Option 2: Clone repo (recommended for developers) ------------------------------------------------- @@ -109,11 +101,15 @@ Option 2: Clone repo (recommended for developers) - ``WINDX`` - ``US-wave`` -Recommended Citation -==================== - -Update with current version and DOI: +rex command line tools +====================== -Michael Rossol, Grant Buster. The REsource Extraction Tool (rex). -https://github.com/NREL/rex (version v0.2.43), 2021. -https://doi.org/10.5281/zenodo.4499033. +- `rex `_ +- `NSRDBX `_ +- `WINDX `_ +- `US-wave `_ +- `WaveX `_ +- `MultiYearX `_ +- `rechunk `_ +- `temporal-stats `_ +- `wind-rose `_ diff --git a/docs/source/index.rst b/docs/source/index.rst index 5f52cf81..54db3014 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -10,14 +10,4 @@ rex documentation ******************* -What is rex? -============= -rex stands for **REsource eXtraciton** tool. - -rex enables the efficient and scalable extraction, manipulation, and -computation with NRELs flagship renewable resource datasets: -the Wind Integration National Dataset (WIND Toolkit), and the National Solar -Radiation Database (NSRDB) - .. include:: ../../README.rst - :start-after: inclusion-intro diff --git a/docs/source/misc/installation.rst b/docs/source/misc/installation.rst index 8b60dfa8..32082cc4 100644 --- a/docs/source/misc/installation.rst +++ b/docs/source/misc/installation.rst @@ -5,13 +5,6 @@ Installation :start-after: Installing rex :end-before: Recommended Citation -Usage on Eagle -============== - -.. include:: ../../../README.rst - :start-after: Using Eagle Env - :end-before: Installing rex - Command Line Tools ================== diff --git a/examples/HSDS/README.rst b/examples/HSDS/README.rst index ec9b36b7..1b3b0ee8 100644 --- a/examples/HSDS/README.rst +++ b/examples/HSDS/README.rst @@ -1,7 +1,26 @@ Highly Scalable Data Service (HSDS) =================================== -`The Highly Scalable Data Service (HSDS) `_ is a cloud-optimized solution for storing and accessing HDF5 files, e.g. the NREL wind and solar datasets. You can access NREL data via HSDS in a few ways. Read below to find out more. +`The Highly Scalable Data Service (HSDS) +`_ is a +cloud-optimized solution for storing and accessing HDF5 files, e.g. the NREL +wind and solar datasets. You can access NREL data via HSDS in a few ways. Read +below to find out more. + +Note that raw NREL .h5 data files are hosted on AWS S3. In contrast, the files +on HSDS are not real "files". They are just domains that you can access with +h5pyd or rex tools to stream small chunks of the files stored on S3. 
The +multi-terabyte .h5 files on S3 would be incredibly cumbersome to access +otherwise. + +Extra Requirements +------------------ + +You may need some additional software beyond the basic ``rex`` install to run this example: + +.. code-block:: bash + + pip install NREL-rex[hsds] NREL Developer API ------------------ diff --git a/examples/NREL_Data/README.rst b/examples/NREL_Data/README.rst index c454aeb3..e94a6190 100644 --- a/examples/NREL_Data/README.rst +++ b/examples/NREL_Data/README.rst @@ -70,29 +70,24 @@ However, this method is slow. The most performant method is via ``HSDS``. or server for just the time or space domain you're interested in. See `this docs page `_ -for easy (but slow) access of the source .h5 files on s3 with ``fsspec``. +for easy (but slow) access of the source .h5 files on s3 with ``fsspec`` that +requires basically zero setup. To find relevant S3 files, you can explore the +S3 directory structure on `OEDI `_ or +with the `AWS CLI `_ See `this docs page `_ for -instructions on how to set up HSDS and then continue on to the Data Access -Examples section below. +instructions on how to set up HSDS for more performant data access that +requires a bit of setup. To find relevant HSDS files, you can use HSDS and +h5pyd to explore the NREL public data directory listings. For example, if you +are running an HSDS local server, you can use the CLI utility ``hsls``, for +example, run: ``$ hsls /nrel/`` or ``$ hsls /nrel/nsrdb/v3/``. You can also use +h5pyd to do the same thing. In a python kernel, ``import h5pyd`` and then run: +``print(list(h5pyd.Folder('/nrel/')))`` to list the ``/nrel/`` directory. There is also an experiment with using `zarr `_, but the examples below may not work with these utilities and the zarr example is not regularly tested. -To find relevant HSDS files, you can use HSDS and h5pyd to explore the NREL -public data directory listings. For example, if you are running an HSDS local -server, you can use the CLI utility ``hsls``, for example, run: ``$ hsls -/nrel/`` or ``$ hsls /nrel/nsrdb/v3/``. You can also use h5pyd to do the same -thing. In a python kernel, ``import h5pyd`` and then run: -``print(list(h5pyd.Folder('/nrel/')))`` to list the ``/nrel/`` directory. - -Note that raw NREL .h5 data files are hosted on AWS S3. In contrast, the files -on HSDS are not real "files". They are just domains that you can access with -h5pyd or rex tools to stream small chunks of the files stored on S3. The -multi-terabyte .h5 files on S3 would be incredibly cumbersome to access -otherwise. - The `Open Energy Data Initiative (OEDI) `_ is also invaluable for finding the source s3 filepaths and for finding energy-relevant public datasets that are not necessarily spatiotemporal h5 @@ -105,7 +100,6 @@ Data Access Examples If you are on the NREL HPC, update the file paths with the relevant NREL HPC file paths in ``/datasets/``. - If you are not at NREL, see the "Data Location - External Users" section above for S3 instructions or for how to setup HSDS and how to find the files that you're interested in. Then update the file paths to the files you want either diff --git a/examples/fsspec/README.rst b/examples/fsspec/README.rst index 319ae160..4566f753 100644 --- a/examples/fsspec/README.rst +++ b/examples/fsspec/README.rst @@ -12,14 +12,14 @@ You may need some additional software beyond the basic ``rex`` install to run th .. 
code-block:: bash - pip install fsspec s3fs + pip install NREL-rex[s3] Code Example ------------ -To open an .h5 file hosted on AWS S3, simply use a path to an S3 resource.: +To open an .h5 file hosted on AWS S3, simply use a path to an S3 resource with any of the ``rex`` file handlers: -- Change ``fp`` to your desired AWS .h5 resource paths (find the s3 paths on `Open Energy Data Initiative (OEDI) `_). +- Change ``fp`` to your desired AWS .h5 resource paths (find the s3 paths on `OEDI `_ or with the `AWS CLI `_). - Running this example on a laptop, it takes ~14 seconds to read the meta data, and another ~14 seconds to read the GHI timeseries. This may be faster when running on AWS services in the same region hosting the .h5 file. It is much slower when running on the NREL VPN. .. code-block:: python diff --git a/rex/multi_time_resource.py b/rex/multi_time_resource.py index 4ac6558c..8f9b290b 100644 --- a/rex/multi_time_resource.py +++ b/rex/multi_time_resource.py @@ -232,7 +232,8 @@ def _get_hsds_file_paths(h5_path, hsds_kwargs=None): import h5pyd except Exception as e: msg = (f'Tried to open hsds file path: "{h5_path}" with ' - 'h5pyd but could not import, try `pip install h5pyd`') + 'h5pyd but could not import, try ' + '`pip install NREL-rex[hsds]`') logger.error(msg) raise ImportError(msg) from e @@ -291,7 +292,7 @@ def _get_s3_file_paths(h5_path): except Exception as e: msg = (f'Tried to open s3 file path: "{h5_path}" with ' 'fsspec but could not import, try ' - '`pip install fsspec s3fs`') + '`pip install NREL-rex[s3]`') logger.error(msg) raise ImportError(msg) from e diff --git a/rex/resource.py b/rex/resource.py index f570c931..e2b8d919 100644 --- a/rex/resource.py +++ b/rex/resource.py @@ -1150,7 +1150,8 @@ def open_file(cls, file_path, mode='r', hsds=False, hsds_kwargs=None): import h5pyd except Exception as e: msg = (f'Tried to open hsds file path: "{file_path}" with ' - 'h5pyd but could not import, try `pip install h5pyd`') + 'h5pyd but could not import, try ' + '`pip install NREL-rex[hsds]`') logger.error(msg) raise ImportError(msg) from e @@ -1171,7 +1172,7 @@ def open_file(cls, file_path, mode='r', hsds=False, hsds_kwargs=None): except Exception as e: msg = (f'Tried to open s3 file path: "{file_path}" with ' 'fsspec but could not import, try ' - '`pip install fsspec s3fs`') + '`pip install NREL-rex[s3]`') logger.error(msg) raise ImportError(msg) from e From 8ee0fbb563f343f7672b4850ba70af7ca15a2c5b Mon Sep 17 00:00:00 2001 From: Grant Buster Date: Thu, 26 Dec 2024 11:57:42 -0700 Subject: [PATCH 12/13] Update rex/multi_time_resource.py Co-authored-by: Paul Pinchuk --- rex/multi_time_resource.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rex/multi_time_resource.py b/rex/multi_time_resource.py index 8f9b290b..7c59c509 100644 --- a/rex/multi_time_resource.py +++ b/rex/multi_time_resource.py @@ -242,7 +242,7 @@ def _get_hsds_file_paths(h5_path, hsds_kwargs=None): if isinstance(h5_path, (list, tuple)): msg = ('HSDS filepath must be a string, possibly with glob ' - 'pattern, but received list/tuple') + f'pattern, but received list/tuple: {h5_path}') logger.error(msg) raise TypeError(msg) From 17820a2bb910d38532a5124fa14aa67e9d6a5ab2 Mon Sep 17 00:00:00 2001 From: Grant Buster Date: Thu, 26 Dec 2024 11:57:49 -0700 Subject: [PATCH 13/13] Update examples/NREL_Data/README.rst Co-authored-by: Paul Pinchuk --- examples/NREL_Data/README.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/NREL_Data/README.rst b/examples/NREL_Data/README.rst 
index e94a6190..295639a3 100644 --- a/examples/NREL_Data/README.rst +++ b/examples/NREL_Data/README.rst @@ -62,7 +62,7 @@ Data Location - External Users ------------------------------ If you are not at NREL, you can't just download these files. They are massive -and downloading the full files would crash your computer. the easiest way to +and downloading the full files would crash your computer. The easiest way to access this data is probably with ``fsspec``, which allows you to access files directly on S3 with only one additional installation and no server setup. However, this method is slow. The most performant method is via ``HSDS``.