forked from Parliament-in-Data/Federal-Parliament-Scraper
-
Notifications
You must be signed in to change notification settings - Fork 0
/
document.py
215 lines (193 loc) · 8.79 KB
/
document.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
from bs4 import BeautifulSoup
import parliament_parser
import requests
import dateparser
from activity import LegislativeActivity, QuestionActivity
import re
import json
from util import normalize_str
from os import path, makedirs
def extract_name(name: str):
    """Extract the "Last, First" part from a string shaped like "Last, First PARTY".

    The member pages list authors as "Lastname, Firstname PARTY"; the party
    token at the end is dropped so the result can be looked up in the
    session's members dict. If the input does not match that shape, it is
    returned unchanged.
    """
    # Raw string: avoids the invalid-escape DeprecationWarning for "\S".
    match = re.match(r"(.+, .+) (\S+)$", name)
    if not (match and match.group(1)):
        return name
    res = match.group(1)
    res = res.replace(' CD&V -', '')  # Fixes a bug caused by "het kartel"
    if res.endswith(','):
        # A multi-word party name can leave a trailing comma in group 1.
        res = res[:-1]
    return res
class ParliamentaryDocument:
    """A legislative document (e.g. a bill) within a parliamentary session.

    On construction it scrapes its metadata from dekamer.be, registers
    itself in ``session.documents`` and posts a ``LegislativeActivity``
    for every resolved author.
    """

    def __init__(self, session, document_number):
        """Scrape the document's metadata and register it with the session.

        :param session: parliamentary session object this document belongs to
        :param document_number: dossier number used in the dekamer.be URL
        """
        self.session = session
        self.document_number = document_number
        self.descriptor = None       # Eurovoc main descriptor(s), if present
        self.keywords = None         # Eurovoc keyword list, if present
        self.title = None
        self.document_type = None
        # Default to the session's start date until a real date is scraped.
        self.date = dateparser.parse(session.start)
        self.authors = []
        self._initialize()
        self.session.documents[document_number] = self
        self._register_activities()

    def description_uri(self):
        """Return the dekamer.be description page URL for this document."""
        return f'https://www.dekamer.be/kvvcr/showpage.cfm?section=/flwb&language=nl&cfm=/site/wwwcfm/flwb/flwbn.cfm?lang=N&legislat={self.session.session}&dossierID={self.document_number}'

    def uri(self):
        """Return the relative path of this document's JSON export."""
        return f'legislation/{self.document_number}.json'

    def json(self, base_path, base_URI="/"):
        """Serialize this document to ``base_path/legislation/<nr>.json``.

        :param base_path: directory under which ``legislation/`` is created
        :param base_URI: URI prefix for author links and the returned URI
        :return: the full URI of the written JSON document
        """
        result = {}
        result['document_number'] = self.document_number
        if self.document_type:
            result['document_type'] = self.document_type
        if self.title:
            result['title'] = self.title
        result['source'] = self.description_uri()
        if not self.date:
            # Scraping found no date: fall back to the session start.
            self.date = dateparser.parse(self.session.start)
        result['date'] = self.date.isoformat()
        result['authors'] = [
            f'{base_URI}{author.uri()}' for author in self.authors]
        if self.descriptor:
            result['descriptor'] = self.descriptor
        if self.keywords:
            result['keywords'] = self.keywords
        base_path = path.join(base_path, "legislation")
        makedirs(base_path, exist_ok=True)
        with open(path.join(base_path, f'{self.document_number}.json'), 'w+') as fp:
            json.dump(result, fp, ensure_ascii=False)
        # BUG FIX: self.uri is a method and must be called; previously the
        # bound-method repr was interpolated into the returned URI.
        return f'{base_URI}{self.uri()}'

    def _initialize(self, retry=False):
        """Scrape date, descriptors, title, type and authors from the site.

        Retries exactly once when the page comes back empty or "not found".
        """
        page = requests.get(self.description_uri())
        soup = BeautifulSoup(page.content, 'html.parser')
        content = soup.find('div', {'id': 'Story'})
        if (not content) or "not found" in content.get_text():
            if retry:
                return
            self._initialize(retry=True)
            return
        proposal_date = soup.find('td', text=re.compile('Indieningsdatum'))
        if not proposal_date:
            # No "Indieningsdatum" label: look for a cell with a bare date.
            proposal_date = soup.find(
                'td', text=re.compile(r'[0-9]+/[0-9]+/[0-9]+'))
            if proposal_date:
                self.date = dateparser.parse(
                    proposal_date.get_text(), languages=['nl'])
        else:
            # The date sits in the last cell of the labelled row.
            self.date = dateparser.parse(
                proposal_date.parent.find_all('td')[-1].get_text(),
                languages=['nl'])
        descriptor = soup.find(
            'td', text=re.compile('Eurovoc-hoofddescriptor'))
        if descriptor:
            self.descriptor = descriptor.parent.find_all(
                'td')[-1].get_text().split(' | ')
        keywords = soup.find('td', text=re.compile('Eurovoc descriptoren'))
        if keywords:
            self.keywords = keywords.parent.find_all(
                'td')[-1].get_text().split(' | ')
        title = content.find('h4')
        if title:
            self.title = title.get_text().strip()
        doc_type_row = [tag for tag in soup.find_all(
            'td', {'class': "td1x"}) if 'Document type' in tag.get_text()]
        # [3:] strips the leading characters before the type text in the cell.
        self.document_type = doc_type_row[0].parent.find(
            'td', {'class': 'td0x'}).find_all(text=True)[0][3:]
        if self.document_type == 'WETSVOORSTEL':
            authors = [tag for tag in soup.find_all(
                'td', {'class': "td1x"}) if 'Auteur(s)' in tag.get_text()]
            authors = authors[0].parent.find(
                'td', {'class': 'td0x'}).find_all(text=True)
            authors = [text.strip() for text in authors if (
                not str(text).isspace()) and ', ' in text]
            for name in authors:
                name = normalize_str(name).decode()
                if name in self.session.get_members_dict():
                    self.authors.append(self.session.get_members_dict()[name])
                elif extract_name(name) in self.session.get_members_dict():
                    self.authors.append(self.session.get_members_dict()[
                        extract_name(name)])
                else:
                    # Author could not be matched to a known member.
                    print("D:" + name)

    def _register_activities(self):
        """Post a LegislativeActivity for every resolved author."""
        if not self.authors:
            return
        for author in self.authors:
            author.post_activity(LegislativeActivity(author, self.date, self))
class ParliamentaryQuestion:
    """A parliamentary question addressed to a minister.

    On construction it scrapes its metadata from dekamer.be, registers
    itself in ``session.questions`` and posts a ``QuestionActivity`` for
    every resolved author.
    """

    def __init__(self, session, document_number: str):
        """Scrape the question's metadata and register it with the session.

        :param session: parliamentary session object this question belongs to
        :param document_number: dossier number used in the dekamer.be URL
        """
        # NOTE: removed an unused `from datetime import datetime` here.
        self.session = session
        self.document_number = document_number
        self.authors = []
        self.title = None
        self.responding_minister = None
        # Default to the session's start date until a real date is scraped.
        self.date = dateparser.parse(session.start)
        self._initialize()
        self.session.questions[document_number] = self
        self._register_activities()

    def _register_activities(self):
        """Post a QuestionActivity for every resolved author."""
        if not self.authors:
            return
        for author in self.authors:
            author.post_activity(QuestionActivity(author, self.date, self))

    def uri(self):
        """Return the relative path of this question's JSON export."""
        return f'questions/{self.document_number}.json'

    def json(self, base_path, base_URI="/"):
        """Serialize this question to ``base_path/questions/<nr>.json``.

        :param base_path: directory under which ``questions/`` is created
        :param base_URI: URI prefix for author links and the returned URI
        :return: the full URI of the written JSON document
        """
        result = {}
        result['document_number'] = self.document_number
        result['title'] = self.title
        if not self.date:
            # Scraping found no date: fall back to the session start.
            self.date = dateparser.parse(self.session.start)
        result['date'] = self.date.isoformat()
        result['source'] = self.description_uri()
        if self.responding_minister:
            # responding_department is only set alongside responding_minister.
            result['responding_minister'] = self.responding_minister
            result['responding_department'] = self.responding_department
        result['authors'] = [
            f'{base_URI}{author.uri()}' for author in self.authors]
        base_path = path.join(base_path, "questions")
        makedirs(base_path, exist_ok=True)
        with open(path.join(base_path, f'{self.document_number}.json'), 'w+') as fp:
            json.dump(result, fp, ensure_ascii=False)
        # BUG FIX: self.uri is a method and must be called; previously the
        # bound-method repr was interpolated into the returned URI.
        return f'{base_URI}{self.uri()}'

    def description_uri(self):
        """Return the dekamer.be description page URL for this question."""
        return f'https://www.dekamer.be/kvvcr/showpage.cfm?section=inqo&language=nl&cfm=inqoXml.cfm?db=INQO&legislat={self.session.session}&dossierID=Q{self.document_number}'

    def _initialize(self, retry=False):
        """Scrape authors, responding minister, title and date from the site.

        Retries exactly once when the page comes back empty or nonexistent.
        """
        page = requests.get(self.description_uri())
        soup = BeautifulSoup(page.content, 'html.parser')
        body = soup.find('body')
        if (not body) or "not exist" in body.get_text():
            if retry:
                return
            self._initialize(retry=True)
            return
        authors = [tag for tag in soup.find_all(
            'td') if 'Auteur(s)' in tag.get_text()]
        if authors:
            authors = authors[0].parent.find_all(
                'td')[1].get_text().split('\n')
            # Drop the trailing comma-separated party token from each name.
            authors = [','.join(text.strip().split(
                ',')[:-1]) for text in authors if (not str(text).isspace()) and ', ' in text]
            for name in authors:
                name = normalize_str(name).decode()
                if name in self.session.get_members_dict():
                    self.authors.append(self.session.get_members_dict()[name])
                elif extract_name(name) in self.session.get_members_dict():
                    self.authors.append(self.session.get_members_dict()[
                        extract_name(name)])
                else:
                    # Author could not be matched to a known member.
                    print("Q:" + name)
        responding_minister_cell = soup.find(
            'i', text=re.compile('Antwoordende minister'))
        if responding_minister_cell:
            # [:-1] strips the trailing punctuation after the minister's name.
            self.responding_minister = responding_minister_cell.find_parent(
                'tr').find_all('td')[1].get_text().strip()[:-1]
            self.responding_department = responding_minister_cell.find_parent(
                'tr').find_next('tr').get_text().strip()
        title = soup.find('i', text=re.compile('Titel'))
        if title:
            self.title = title.find_parent('tr').find_all('td')[
                1].get_text().strip()
            # Collapse blank lines and per-line whitespace in the title.
            self.title = "\n".join(item.strip()
                                   for item in self.title.split('\n') if item.strip())
        date = soup.find('i', text=re.compile('Datum bespreking'))
        if date:
            self.date = dateparser.parse(
                date.find_parent('tr').find_all('td')[1].get_text().strip(),
                languages=['nl'])