From b13b874d1f49c1e6d8ff8ed9ad78f88e474a31ab Mon Sep 17 00:00:00 2001 From: Matt Amos Date: Fri, 12 Apr 2019 19:45:30 +0100 Subject: [PATCH 1/2] Update WOF import pipeline. Previously, we would look at the metadata CSVs, load all the referenced GeoJSON files over HTTP, parse them and put them into PostgreSQL. The hosted `pgdump` version was just dumped from a manual import of this data. Because we're now doing global builds, this data has grown stale. This change adds a script to download and parse the WOF "bundles", which are `tar.gz` files containing the GeoJSON files - so we download 4 files instead of thousands. Instead of loading this into a database, we dump the data out as a SQL file, ready to be imported at database setup time. The SQL dump is put into the `shapefiles.tar.gz` versioned static data asset, similar to Natural Earth and the OSMData land/water polygons. This means it's stable across releases, but we can update it as part of a regular asset rebuild. --- data/Makefile-prepare-data.jinja2 | 7 +- data/assets.yaml | 2 +- data/wof_snapshot.py | 135 ++++++++++++++++++++++++++++++ requirements.txt | 1 + 4 files changed, 142 insertions(+), 3 deletions(-) create mode 100644 data/wof_snapshot.py diff --git a/data/Makefile-prepare-data.jinja2 b/data/Makefile-prepare-data.jinja2 index 620e3b599..33cac60e8 100644 --- a/data/Makefile-prepare-data.jinja2 +++ b/data/Makefile-prepare-data.jinja2 @@ -11,8 +11,8 @@ upload: shapefiles shapefiles: shapefiles.tar.gz -shapefiles.tar.gz: {{ tgt_shapefile_zips }} - tar czf shapefiles.tar.gz {{ tgt_shapefile_zips }} +shapefiles.tar.gz: {{ tgt_shapefile_zips }} wof_snapshot.sql + tar czf shapefiles.tar.gz $^ download: {{ src_shapefile_zips }} @@ -53,6 +53,9 @@ download: {{ src_shapefile_zips }} {% endfor %} +wof_snapshot.sql: + python wof_snapshot.py + clean: rm -rf shapefiles.tar.gz {{ tgt_shapefile_zips }} {{ tgt_shapefile_wildcards }} {{ src_shapefile_zips }} {{ src_shapefile_wildcards }} diff --git a/data/assets.yaml 
b/data/assets.yaml
index 5769c9811..c9f348eb0 100644
--- a/data/assets.yaml
+++ b/data/assets.yaml
@@ -1,5 +1,5 @@
 bucket: tilezen-assets
-datestamp: 20190326
+datestamp: 20190412
 
 shapefiles:
 
diff --git a/data/wof_snapshot.py b/data/wof_snapshot.py
new file mode 100644
index 000000000..74493ccc3
--- /dev/null
+++ b/data/wof_snapshot.py
@@ -0,0 +1,141 @@
+from os.path import basename
+from os.path import splitext
+from os.path import join as path_join
+from tilequeue.wof import Neighbourhood
+from tilequeue.wof import NeighbourhoodFailure
+from tilequeue.wof import NeighbourhoodMeta
+from tilequeue.wof import create_neighbourhood_from_json
+from tilequeue.wof import write_neighbourhood_data_to_file
+import json
+import tarfile
+import requests
+from tqdm import tqdm
+
+
+def _parse_wof_id(s):
+    # WOF bundle entries are named <wof_id>.geojson; recover the numeric ID.
+    wof_id, ext = splitext(basename(s))
+    assert ext == '.geojson'
+    return int(wof_id)
+
+
+def _parse_neighbourhood(file_name, data, placetype, file_hash):
+    # Returns either a Neighbourhood or a NeighbourhoodFailure: tilequeue
+    # reports parse problems by value rather than by raising.
+    wof_id = _parse_wof_id(file_name)
+    meta = NeighbourhoodMeta(wof_id, placetype, None, file_hash, None)
+    json_data = json.loads(data)
+    return create_neighbourhood_from_json(json_data, meta)
+
+
+class WOFArchiveReader(object):
+    """
+    Collects WOF parsed data items (mostly neighbourhoods) from a series of
+    tar-compressed "bundles" as distributed by WOF.
+    """
+
+    def __init__(self):
+        self.wof_items = []
+
+    def add_archive(self, archive, file_hash, count, placetype):
+        """
+        Adds the GeoJSON files in the archive to the list of wof_items,
+        parsing each as the given WOF placetype (e.g. 'neighbourhood').
+
+        Displays a progress bar, with count being the expected number of
+        items in the archive.
+        """
+
+        with tqdm(total=count) as pbar:
+            with tarfile.open(archive) as tar:
+                for info in tar:
+                    if info.isfile() and info.name.endswith('.geojson'):
+                        self._parse_file(
+                            info.name, tar.extractfile(info).read(),
+                            placetype, file_hash)
+                        pbar.update(1)
+
+    def _parse_file(self, file_name, data, placetype, file_hash):
+        n_or_fail = _parse_neighbourhood(
+            file_name, data, placetype, file_hash)
+        if isinstance(n_or_fail, Neighbourhood):
+            self.wof_items.append(n_or_fail)
+        elif isinstance(n_or_fail, NeighbourhoodFailure):
+            # skipped/funky/superseded entries are expected and silently
+            # dropped; any other failure indicates bad input data.
+            if not (n_or_fail.skipped or n_or_fail.funky or
+                    n_or_fail.superseded):
+                raise ValueError("Failed to parse neighbourhood: %s "
+                                 "(because: %s)"
+                                 % (n_or_fail.message, n_or_fail.reason))
+        else:
+            raise ValueError("Unexpected %r" % (n_or_fail,))
+
+
+class tmpdownload(object):
+    """
+    Downloads a file to a temporary location and yields its absolute path.
+    Once the scope exits, deletes the temporary file.
+    """
+
+    def __init__(self, url, expected_size):
+        import tempfile
+        self.tempdir = tempfile.mkdtemp()
+
+        fname = url.split('/')[-1]
+        abs_fname = path_join(self.tempdir, fname)
+
+        # see https://stackoverflow.com/questions/16694907/#16696317
+        with requests.get(url, stream=True) as response:
+            response.raise_for_status()
+
+            with tqdm(total=expected_size) as pbar:
+                with open(abs_fname, 'wb') as fh:
+                    for chunk in response.iter_content(chunk_size=16384):
+                        if chunk:
+                            fh.write(chunk)
+                            pbar.update(len(chunk))
+
+        self.abs_fname = abs_fname
+
+    def __enter__(self):
+        return self.abs_fname
+
+    def __exit__(self, type, value, traceback):
+        import shutil
+        shutil.rmtree(self.tempdir)
+
+
+WOF_INVENTORY = 'https://dist.whosonfirst.org/bundles/inventory.json'
+WOF_BUNDLE_PREFIX = 'https://dist.whosonfirst.org/bundles/'
+
+
+if __name__ == '__main__':
+    inventory = requests.get(WOF_INVENTORY).json()
+    reader = WOFArchiveReader()
+
+    for placetype in ('neighbourhood', 'macrohood', 'microhood', 'borough'):
+        bundle = 'whosonfirst-data-%s-latest.tar.bz2' % (placetype,)
+
+        matching = [item for item in inventory
+                    if item['name_compressed'] == bundle]
+        assert len(matching) == 1
+        item = matching[0]
+
+        version = item['last_updated']
+        count = item['count']
+        download_size = item['size_compressed']
+
+        print "Downloading %r with %d entries" % (placetype, count)
+        with tmpdownload(WOF_BUNDLE_PREFIX + bundle, download_size) as fname:
+            print "Parsing WOF data"
+            reader.add_archive(fname, version, count, placetype)
+
+    print "Writing output SQL"
+    with open('wof_snapshot.sql', 'w') as fh:
+        fh.write("COPY public.wof_neighbourhood ("
+                 "wof_id, placetype, name, hash, n_photos, area, min_zoom, "
+                 "max_zoom, is_landuse_aoi, label_position, geometry, "
+                 "inception, cessation, is_visible, l10n_name) FROM stdin;\n")
+        write_neighbourhood_data_to_file(fh, reader.wof_items)
+        fh.write("\\.\n")
diff --git a/requirements.txt b/requirements.txt
index c873a1383..7259182d9 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -20,6 +20,7 @@ Shapely==1.6.2.post1
 simplejson==3.12.0
 six==1.11.0
 StreetNames==0.1.5
+tqdm=4.31.1
 Werkzeug==0.12.2
 wsgiref==0.1.2
 git+https://github.com/tilezen/tilequeue@master#egg=tilequeue

From 7f7f4ce9b7c126341e1edbca83752efb80eafc03 Mon Sep 17 00:00:00 2001
From: Matt Amos
Date: Fri, 12 Apr 2019 19:52:52 +0100
Subject: [PATCH 2/2] Fix typo in requirements.txt, should have been == for
 tqdm version.

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 7259182d9..2446fd68c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -20,7 +20,7 @@ Shapely==1.6.2.post1
 simplejson==3.12.0
 six==1.11.0
 StreetNames==0.1.5
-tqdm=4.31.1
+tqdm==4.31.1
 Werkzeug==0.12.2
 wsgiref==0.1.2
 git+https://github.com/tilezen/tilequeue@master#egg=tilequeue