From b13b874d1f49c1e6d8ff8ed9ad78f88e474a31ab Mon Sep 17 00:00:00 2001 From: Matt Amos Date: Fri, 12 Apr 2019 19:45:30 +0100 Subject: [PATCH 1/2] Update WOF import pipeline. Previously, we would look at the metadata CSVs, load all the referenced GeoJSON files over HTTP, parse them and put them into PostgreSQL. The hosted `pgdump` version was just dumped from a manual import of this data. Because we're now doing global builds, this data has grown stale. This change adds a script to download and parse the WOF "bundles", which are `tar.gz` files containing the GeoJSON files - so we download 4 files instead of thousands. Instead of loading this into a database, we dump the data out as a SQL file, ready to be imported at database setup time. The SQL dump is put into the `shapefiles.tar.gz` versioned static data asset, similar to Natural Earth and the OSMData land/water polygons. This means it's stable across releases, but we can update it as part of a regular asset rebuild. --- data/Makefile-prepare-data.jinja2 | 7 +- data/assets.yaml | 2 +- data/wof_snapshot.py | 135 ++++++++++++++++++++++++++++++ requirements.txt | 1 + 4 files changed, 142 insertions(+), 3 deletions(-) create mode 100644 data/wof_snapshot.py diff --git a/data/Makefile-prepare-data.jinja2 b/data/Makefile-prepare-data.jinja2 index 620e3b599..33cac60e8 100644 --- a/data/Makefile-prepare-data.jinja2 +++ b/data/Makefile-prepare-data.jinja2 @@ -11,8 +11,8 @@ upload: shapefiles shapefiles: shapefiles.tar.gz -shapefiles.tar.gz: {{ tgt_shapefile_zips }} - tar czf shapefiles.tar.gz {{ tgt_shapefile_zips }} +shapefiles.tar.gz: {{ tgt_shapefile_zips }} wof_snapshot.sql + tar czf shapefiles.tar.gz $^ download: {{ src_shapefile_zips }} @@ -53,6 +53,9 @@ download: {{ src_shapefile_zips }} {% endfor %} +wof_snapshot.sql: + python wof_snapshot.py + clean: rm -rf shapefiles.tar.gz {{ tgt_shapefile_zips }} {{ tgt_shapefile_wildcards }} {{ src_shapefile_zips }} {{ src_shapefile_wildcards }} diff --git a/data/assets.yaml 
b/data/assets.yaml
index 5769c9811..c9f348eb0 100644
--- a/data/assets.yaml
+++ b/data/assets.yaml
@@ -1,5 +1,5 @@
 bucket: tilezen-assets
-datestamp: 20190326
+datestamp: 20190412
 
 shapefiles:
 
diff --git a/data/wof_snapshot.py b/data/wof_snapshot.py
new file mode 100644
index 000000000..74493ccc3
--- /dev/null
+++ b/data/wof_snapshot.py
@@ -0,0 +1,141 @@
+from os.path import basename
+from os.path import splitext
+from os.path import join as path_join
+from tilequeue.wof import Neighbourhood
+from tilequeue.wof import NeighbourhoodFailure
+from tilequeue.wof import NeighbourhoodMeta
+from tilequeue.wof import create_neighbourhood_from_json
+from tilequeue.wof import write_neighbourhood_data_to_file
+import json
+import tarfile
+import requests
+from tqdm import tqdm
+
+
+def _parse_wof_id(s):
+    # WOF bundle entries are named <wof_id>.geojson; recover the numeric ID.
+    wof_id, ext = splitext(basename(s))
+    assert ext == '.geojson'
+    return int(wof_id)
+
+
+def _parse_neighbourhood(file_name, data, placetype, file_hash):
+    # Returns either a Neighbourhood or a NeighbourhoodFailure: tilequeue
+    # reports parse problems by value rather than by raising.
+    wof_id = _parse_wof_id(file_name)
+    meta = NeighbourhoodMeta(wof_id, placetype, None, file_hash, None)
+    json_data = json.loads(data)
+    return create_neighbourhood_from_json(json_data, meta)
+
+
+class WOFArchiveReader(object):
+    """
+    Collects WOF parsed data items (mostly neighbourhoods) from a series of
+    tar-compressed "bundles" as distributed by WOF.
+    """
+
+    def __init__(self):
+        self.wof_items = []
+
+    def add_archive(self, archive, file_hash, count, placetype):
+        """
+        Adds the GeoJSON files in the archive to the list of wof_items,
+        parsing each as the given WOF placetype (e.g. 'neighbourhood').
+
+        Displays a progress bar, with count being the expected number of
+        items in the archive.
+        """
+
+        with tqdm(total=count) as pbar:
+            with tarfile.open(archive) as tar:
+                for info in tar:
+                    if info.isfile() and info.name.endswith('.geojson'):
+                        self._parse_file(
+                            info.name, tar.extractfile(info).read(),
+                            placetype, file_hash)
+                        pbar.update(1)
+
+    def _parse_file(self, file_name, data, placetype, file_hash):
+        n_or_fail = _parse_neighbourhood(
+            file_name, data, placetype, file_hash)
+        if isinstance(n_or_fail, Neighbourhood):
+            self.wof_items.append(n_or_fail)
+        elif isinstance(n_or_fail, NeighbourhoodFailure):
+            # skipped/funky/superseded entries are expected and silently
+            # dropped; any other failure indicates bad input data.
+            if not (n_or_fail.skipped or n_or_fail.funky or
+                    n_or_fail.superseded):
+                raise ValueError("Failed to parse neighbourhood: %s "
+                                 "(because: %s)"
+                                 % (n_or_fail.message, n_or_fail.reason))
+        else:
+            raise ValueError("Unexpected %r" % (n_or_fail,))
+
+
+class tmpdownload(object):
+    """
+    Downloads a file to a temporary location and yields its absolute path.
+    Once the scope exits, deletes the temporary file.
+    """
+
+    def __init__(self, url, expected_size):
+        import tempfile
+        self.tempdir = tempfile.mkdtemp()
+
+        fname = url.split('/')[-1]
+        abs_fname = path_join(self.tempdir, fname)
+
+        # see https://stackoverflow.com/questions/16694907/#16696317
+        with requests.get(url, stream=True) as response:
+            response.raise_for_status()
+
+            with tqdm(total=expected_size) as pbar:
+                with open(abs_fname, 'wb') as fh:
+                    for chunk in response.iter_content(chunk_size=16384):
+                        if chunk:
+                            fh.write(chunk)
+                            pbar.update(len(chunk))
+
+        self.abs_fname = abs_fname
+
+    def __enter__(self):
+        return self.abs_fname
+
+    def __exit__(self, type, value, traceback):
+        import shutil
+        shutil.rmtree(self.tempdir)
+
+
+WOF_INVENTORY = 'https://dist.whosonfirst.org/bundles/inventory.json'
+WOF_BUNDLE_PREFIX = 'https://dist.whosonfirst.org/bundles/'
+
+
+if __name__ == '__main__':
+    inventory = requests.get(WOF_INVENTORY).json()
+    reader = WOFArchiveReader()
+
+    for placetype in ('neighbourhood', 'macrohood', 'microhood', 'borough'):
+        bundle = 'whosonfirst-data-%s-latest.tar.bz2' % (placetype,)
+
+        matching = [item for item in inventory
+                    if item['name_compressed'] == bundle]
+        assert len(matching) == 1
+        item = matching[0]
+
+        version = item['last_updated']
+        count = item['count']
+        download_size = item['size_compressed']
+
+        print "Downloading %r with %d entries" % (placetype, count)
+        with tmpdownload(WOF_BUNDLE_PREFIX + bundle, download_size) as fname:
+            print "Parsing WOF data"
+            reader.add_archive(fname, version, count, placetype)
+
+    print "Writing output SQL"
+    with open('wof_snapshot.sql', 'w') as fh:
+        fh.write("COPY public.wof_neighbourhood ("
+                 "wof_id, placetype, name, hash, n_photos, area, min_zoom, "
+                 "max_zoom, is_landuse_aoi, label_position, geometry, "
+                 "inception, cessation, is_visible, l10n_name) FROM stdin;\n")
+        write_neighbourhood_data_to_file(fh, reader.wof_items)
+        fh.write("\\.\n")
diff --git a/requirements.txt b/requirements.txt
index c873a1383..7259182d9 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -20,6 +20,7 @@ Shapely==1.6.2.post1
 simplejson==3.12.0
 six==1.11.0
 StreetNames==0.1.5
+tqdm=4.31.1
 Werkzeug==0.12.2
 wsgiref==0.1.2
 git+https://github.com/tilezen/tilequeue@master#egg=tilequeue

From 7f7f4ce9b7c126341e1edbca83752efb80eafc03 Mon Sep 17 00:00:00 2001
From: Matt Amos
Date: Fri, 12 Apr 2019 19:52:52 +0100
Subject: [PATCH 2/2] Fix typo in requirements.txt, should have been == for
 tqdm version.

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 7259182d9..2446fd68c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -20,7 +20,7 @@ Shapely==1.6.2.post1
 simplejson==3.12.0
 six==1.11.0
 StreetNames==0.1.5
-tqdm=4.31.1
+tqdm==4.31.1
 Werkzeug==0.12.2
 wsgiref==0.1.2
 git+https://github.com/tilezen/tilequeue@master#egg=tilequeue