Skip to content

Commit

Permalink
Merge pull request #410 from knaaptime/lodesfips
Browse files Browse the repository at this point in the history
data defs; fix gadm
  • Loading branch information
knaaptime authored Nov 9, 2024
2 parents 88ae3c7 + 4479a9b commit 7819029
Show file tree
Hide file tree
Showing 7 changed files with 72 additions and 46 deletions.
42 changes: 32 additions & 10 deletions geosnap/_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,8 +83,10 @@ def __dir__(self):
"ltdb",
"msa_definitions",
"msas",
"naics_definitions",
"ncdb",
"nces",
"nlcd_definitions",
"seda",
"states",
"show_data_dir",
Expand All @@ -109,7 +111,7 @@ def show_data_dir(self, verbose=True):
return self.data_dir

def lodes_codebook(self):
"""_summary_
"""Return a table of descriptive variable names for the LODES data
Returns
-------
Expand All @@ -121,7 +123,7 @@ def lodes_codebook(self):
)

def bea_regions(self):
"""Table that maps states to their respective BEA regions
"""Return a table that maps states to their respective BEA regions
Returns
-------
Expand Down Expand Up @@ -217,16 +219,20 @@ def seda(
"long",
"poolsub",
], "`pool` argument must be either 'pool', 'long', or 'poolsub'"
assert standardize in [
"gcs",
"cs",
], "`standardize` argument must be either 'cs' for cohort-standardized or 'gcs' for grade-cohort-standardized"
assert (
standardize
in [
"gcs",
"cs",
]
), "`standardize` argument must be either 'cs' for cohort-standardized or 'gcs' for grade-cohort-standardized"

if pooling == "poolsub":
fn = f"seda_{level}_{pooling}_{standardize}_4.1_corrected"
fn = f"seda_{level}_{pooling}_{standardize}_5.0"
else:
fn = f"seda_{level}_{pooling}_{standardize}_4.1"
fn = f"seda_{level}_{pooling}_{standardize}_5.0"
local_path = pathlib.Path(self.data_dir, "seda", f"{fn}.parquet")
remote_path = f"https://stacks.stanford.edu/file/druid:xv742vh9296/{fn}.csv"
remote_path = f"https://stacks.stanford.edu/file/druid:cs829jn7849/{fn}.csv"
msg = (
"Streaming data from SEDA archive at <https://exhibits.stanford.edu/data/catalog/db586ns4974>.\n"
"Use `geosnap.io.store_seda()` to store the data locally for better performance"
Expand Down Expand Up @@ -596,7 +602,8 @@ def msa_definitions(self):
return pd.read_csv(
os.path.join(
os.path.dirname(os.path.abspath(__file__)), "io/msa_definitions.csv"
)
),
converters={"stcofips": str},
)

def ltdb(self):
Expand Down Expand Up @@ -645,3 +652,18 @@ def codebook(self):
return pd.read_csv(
os.path.join(os.path.dirname(os.path.abspath(__file__)), "io/variables.csv")
)

def nlcd_definitions(self):
    """Return the table of NLCD land cover classification definitions.

    The table is read from the ``io/nlcd_definitions.csv`` file located
    relative to this module's own directory.

    Returns
    -------
    pandas.DataFrame
        table with one row per NLCD land cover class, holding the numeric
        class ``code``, its display ``color`` (a comma-separated triple,
        presumably RGB), the ``classification`` name, and a text
        ``description`` of the class.
    """
    # Anchor the lookup to this module's directory so the file is found
    # regardless of the caller's current working directory.
    module_dir = os.path.dirname(os.path.abspath(__file__))
    return pd.read_csv(os.path.join(module_dir, "io/nlcd_definitions.csv"))
2 changes: 1 addition & 1 deletion geosnap/analyze/_cluster_wrappers.py
Original file line number Diff line number Diff line change
Expand Up @@ -279,7 +279,7 @@ def gaussian_mixture(

# selection routine from
# https://plot.ly/scikit-learn/plot-gmm-selection/
lowest_bic = np.infty
lowest_bic = np.inf
bic = []
maxn = max_clusters + 1
n_components_range = range(1, maxn)
Expand Down
11 changes: 4 additions & 7 deletions geosnap/io/constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -563,7 +563,7 @@ def get_lodes(
state_fips=state_fips,
county_fips=county_fips,
msa_fips=msa_fips,
fips=fips,
fips=allfips,
data=gdf,
)
if isinstance(boundary, gpd.GeoDataFrame):
Expand Down Expand Up @@ -608,19 +608,16 @@ def get_lodes(


def _msa_to_county(datastore, msa_fips):
msa_defs = datastore.msa_definitions()
if msa_fips:
pr_metros = set(
datastore.msa_definitions()[
datastore.msa_definitions()["CBSA Title"].str.contains("PR")
]["CBSA Code"].tolist()
msa_defs[msa_defs["CBSA Title"].str.contains("PR")]["CBSA Code"].tolist()
)
if msa_fips in pr_metros:
raise Exception(
"geosnap does not yet include built-in data for Puerto Rico"
)
msa_counties = datastore.msa_definitions()[
datastore.msa_definitions()["CBSA Code"] == msa_fips
]["stcofips"].tolist()
msa_counties = msa_defs[msa_defs["CBSA Code"] == msa_fips]["stcofips"].tolist()

else:
msa_counties = None
Expand Down
31 changes: 7 additions & 24 deletions geosnap/io/gadm.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,9 @@
"""Utilities for fetching data from GADM."""

import os
import tempfile

import geopandas as gpd


def get_gadm(code, level=0, use_fsspec=True, gpkg=True, n_retries=3):
def get_gadm(code, level=0):
"""Collect data from GADM as a geodataframe.
Parameters
Expand All @@ -15,14 +12,6 @@ def get_gadm(code, level=0, use_fsspec=True, gpkg=True, n_retries=3):
three character ISO code for a country
level : int, optional
which geometry level to collect, by default 0
use_fsspec : bool
whether to use the `fsspec` library
gpkg : bool
whether to read from a geopackage or shapefile. If True,
geopackage will be read; shapefile if False. Ignored if using fsspec
n_retries : int optional
number of retries in case read fails from direct stream from GADM.
Ignored if using fsspec.
Returns
-------
Expand All @@ -44,15 +33,9 @@ def get_gadm(code, level=0, use_fsspec=True, gpkg=True, n_retries=3):
with this method would always returns the layer with index 0 in the geopackage file).
"""
code = code.upper()
import fsspec

with tempfile.TemporaryDirectory() as temp_path:
with fsspec.open(
f"simplecache::zip://*.gpkg::https://biogeo.ucdavis.edu/data/gadm3.6/gpkg/gadm36_{code}_gpkg.zip",
simplecache={"cache_storage": temp_path},
):
gdf = gpd.read_file(
os.path.join(temp_path, os.listdir(temp_path)[0]),
layer=f"gadm36_{code}_{level}",
)
return gdf

gdf = gpd.read_file(
f"https://geodata.ucdavis.edu/gadm/gadm4.1/gpkg/gadm41_{code}.gpkg",
layer=f"ADM_ADM_{level}",
)
return gdf
4 changes: 2 additions & 2 deletions geosnap/io/networkio.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,13 +83,13 @@ def get_network_from_gdf(
}

impedance = "length"
graph = ox.graph_from_polygon(gdf.unary_union, network_type=network_type)
graph = ox.graph_from_polygon(gdf.union_all(), network_type=network_type)
if add_travel_times:
graph = ox.add_edge_speeds(graph, default_speeds)
graph = ox.add_edge_travel_times(graph)
impedance = "travel_time"

n, e = ox.utils_graph.graph_to_gdfs(graph)
n, e = ox.graph_to_gdfs(graph)
if output_crs is not None:
n = _reproject_osm_nodes(n, input_crs=4326, output_crs=output_crs)
e = e.to_crs(output_crs)
Expand Down
17 changes: 17 additions & 0 deletions geosnap/io/nlcd_definitions.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
code,color,classification,description
11,"70,107,159",Open Water,"areas of open water, generally with less than 25% cover of vegetation or soil."
12,"209,222,248",Perennial Snow/Ice,"areas characterized by a perennial cover of ice and/or snow, generally greater than 25% of total cover."
21,"222,197,197","Developed, Open Space","areas with a mixture of some constructed materials, but mostly vegetation in the form of lawn grasses. Impervious surfaces account for less than 20% of total cover. These areas most commonly include large-lot single-family housing units, parks, golf courses, and vegetation planted in developed settings for recreation, erosion control, or aesthetic purposes."
22,"217,146,130","Developed, Low Intensity",areas with a mixture of constructed materials and vegetation. Impervious surfaces account for 20% to 49% of total cover. These areas most commonly include single-family housing units
23,"235,0,0","Developed, Medium Intensity",areas with a mixture of constructed materials and vegetation. Impervious surfaces account for 50% to 79% of the total cover. These areas most commonly include single-family housing units.
24,"171,0,0",Developed High Intensity,"highly developed areas where people reside or work in high numbers. Examples include apartment complexes, row houses and commercial/industrial. Impervious surfaces account for 80% to 100% of the total cover"
31,"179,172,159",Barren Land (Rock/Sand/Clay),"areas of bedrock, desert pavement, scarps, talus, slides, volcanic material, glacial debris, sand dunes, strip mines, gravel pits and other accumulations of earthen material. Generally, vegetation accounts for less than 15% of total cover."
41,"104,171,95",Deciduous Forest,"areas dominated by trees generally greater than 5 meters tall, and greater than 20% of total vegetation cover. More than 75% of the tree species shed foliage simultaneously in response to seasonal change."
42,"28,95,44",Evergreen Forest,"areas dominated by trees generally greater than 5 meters tall, and greater than 20% of total vegetation cover. More than 75% of the tree species maintain their leaves all year. Canopy is never without green foliage."
43,"181,197,143",Mixed Forest,"areas dominated by trees generally greater than 5 meters tall, and greater than 20% of total vegetation cover. Neither deciduous nor evergreen species are greater than 75% of total tree cover."
52,"204,184,121",Shrub/Scrub,"areas dominated by shrubs; less than 5 meters tall with shrub canopy typically greater than 20% of total vegetation. This class includes true shrubs, young trees in an early successional stage or trees stunted from environmental conditions"
71,"223,223,194",Grassland/Herbaceous,"areas dominated by graminoid or herbaceous vegetation, generally greater than 80% of total vegetation. These areas are not subject to intensive management such as tilling, but can be utilized for grazing."
81,"220,217,57",Pasture/Hay,"areas of grasses, legumes, or grass-legume mixtures planted for livestock grazing or the production of seed or hay crops, typically on a perennial cycle. Pasture/hay vegetation accounts for greater than 20% of total vegetation."
82,"171,108,40",Cultivated Crops,"areas used for the production of annual crops, such as corn, soybeans, vegetables, tobacco, and cotton, and also perennial woody crops such as orchards and vineyards. Crop vegetation accounts for greater than 20% of total vegetation. This class also includes all land being actively tilled."
90,"184,217,235",Woody Wetlands,areas where forest or shrubland vegetation accounts for greater than 20% of vegetative cover and the soil or substrate is periodically saturated with or covered with water.
95,"108,159,184",Emergent Herbaceous Wetlands,areas where perennial herbaceous vegetation accounts for greater than 80% of vegetative cover and the soil or substrate is periodically saturated with or covered with water.
11 changes: 9 additions & 2 deletions geosnap/tests/test_isochrones.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,10 @@ def test_isos_from_gdf_shapely():
)
assert_almost_equal(t.area.astype(float).round(8).tolist()[0], 0.00012474)


@pytest.mark.skipif(
sys.platform.startswith("win"),
reason="skipping test on windows because of dtype issue",
)
def test_network_constructor():
tracts = get_acs(DataStore(), county_fips='48301', level='tract', years=2015)
walk_net = get_network_from_gdf(tracts)
Expand All @@ -102,8 +105,12 @@ def test_isos_with_edges():
)
print(alpha.area.round(8))
# this will grow depending on the size of the OSM network when tested...
assert alpha.area.round(8).iloc[0] == 0.00026001
assert alpha.area.round(8).iloc[0] >= 0.00036433

@pytest.mark.skipif(
sys.platform.startswith("win"),
reason="skipping test on windows because of dtype issue",
)
def test_project_network():
tracts = get_acs(DataStore(), county_fips='48301', level='tract', years=2015)
walk_net = get_network_from_gdf(tracts)
Expand Down

0 comments on commit 7819029

Please sign in to comment.