From e6f416c5d551d090ddaa25e59b27150d5eda0356 Mon Sep 17 00:00:00 2001
From: Breyten Ernsting
Date: Tue, 16 Jan 2024 15:04:29 +0100
Subject: [PATCH] Add woogle location scraper. See #145

---
 backend/jodal/locations.py | 62 +++++++++++++++++++++++++++++++++++++-
 1 file changed, 61 insertions(+), 1 deletion(-)

diff --git a/backend/jodal/locations.py b/backend/jodal/locations.py
index 1d623e7..c6335b5 100644
--- a/backend/jodal/locations.py
+++ b/backend/jodal/locations.py
@@ -6,9 +6,11 @@
 from pprint import pformat
 import hashlib
 from copy import deepcopy
+from urllib.parse import urljoin
 
 import requests
 from elasticsearch.helpers import bulk
+from lxml import etree
 
 from jodal.es import setup_elasticsearch
 from jodal.scrapers import MemoryMixin, ElasticsearchMixin, BaseScraper, ElasticsearchBulkMixin
@@ -146,13 +148,71 @@ def transform(self, item):
             'source': self.name
         })
 
+
+class WoogleLocationScraper(MemoryMixin, BaseLocationScraper):
+    """Scrape the 'Gemeente' publisher listing page of Woogle."""
+
+    name = 'woogle'
+    url = 'https://doi.wooverheid.nl/?doi=nl&dim=publisher&category=Gemeente'
+
+    def _sanatize_name(self, name):
+        """Return `name` without the 'Gemeente ' label and outer whitespace."""
+        return name.replace('Gemeente ', '').strip()
+
+    def fetch(self):
+        """Download the listing page and return one dict per data row.
+
+        Each dict has the absolute `url` of the row's link, the `code`
+        (text of the first cell), the `name` (second cell) and the `count`
+        (third cell, as an int). Rows without a code are skipped.
+        """
+        resp = requests.get(self.url, headers=self.headers)
+        html = etree.HTML(resp.content)
+        if html is None:
+            # lxml returns None for an empty or unparseable body.
+            return []
+
+        results = []
+        for row in html.xpath("//table//tr"):
+            code = u''.join(row.xpath('./td[1]//text()')).strip()
+            if not code:
+                # Header and spacer rows carry no identifier; emitting them
+                # would create a location with an empty id.
+                continue
+            hrefs = row.xpath('./td[1]/a/@href')
+            # Rows without a link fall back to the listing page itself.
+            link = urljoin(self.url, hrefs[0]) if hrefs else self.url
+            name = u''.join(row.xpath('./td[2]//text()')).strip()
+            # Keep digits only so both '1,234' and '1.234' parse, and an
+            # empty or non-numeric cell counts as zero instead of raising.
+            raw = u''.join(row.xpath('./td[3]//text()'))
+            digits = u''.join(c for c in raw if c in '0123456789')
+            results.append({
+                'url': link,
+                'code': code,
+                'name': name,
+                'count': int(digits) if digits else 0
+            })
+
+        return results
+
+    def transform(self, item):
+        """Pass the cleaned name, the code and the source to the parent."""
+        name = self._sanatize_name(item['name'])
+        return super(WoogleLocationScraper, self).transform({
+            'name': name,
+            'id': item['code'],
+            'source': self.name
+        })
+
 class LocationsScraperRunner(object):
     scrapers = [
         PoliFlwLocationScraper,
         OpenspendingCountyLocationScraper,
         OpenspendingProvinceLocationScraper,
         OpenBesluitvormingLocationScraper,
-        CVDRLocationScraper
+        CVDRLocationScraper,
+        WoogleLocationScraper
     ]
 
     year = '2023'