-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathWHOCoronaDataScraper.py
103 lines (83 loc) · 3.37 KB
/
WHOCoronaDataScraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
import PyPDF2
from datetime import date, datetime, timedelta
import pycountry
import requests
from io import BytesIO
# URL template of a WHO situation-report PDF. The first placeholder takes the
# date string and the second the issue number (see prepare_WHO_grabberdata).
link = 'https://www.who.int/docs/default-source/coronaviruse/situation-reports/{}-sitrep-{}-covid-19.pdf'

# Publication date of the first report; issue numbers are counted from here.
firstDate = date(2020, 1, 21)

# Name of every country known to pycountry.
countries = [entry.name for entry in pycountry.countries]

# TODO: Figure out how to read every country. NLTK.Word_tokenizer?
def pdf_exists(link):
    """Return whether a GET request for *link* answers with anything but 404.

    Args:
        link: URL of the document to probe.

    Returns:
        False when the server responds with HTTP 404, True for every other
        status code.
    """
    # stream=True defers downloading the body, so only the status line and
    # headers are fetched. A streamed response keeps its connection checked
    # out until it is closed, hence the context manager.
    with requests.get(link, stream=True) as response:
        return response.status_code != 404
def get_pdf_stream(link):
    """Download *link* and wrap the response body in an in-memory stream.

    Args:
        link: URL to fetch with a GET request.

    Returns:
        A ``BytesIO`` holding the raw response content. The HTTP status is
        not inspected, so the content of an error response is returned too.
    """
    return BytesIO(requests.get(link).content)
def prepare_WHO_grabberdata(date):
    """Build the two values that fill the placeholders of the report URL.

    Args:
        date: A ``datetime.date`` identifying the report.

    Returns:
        A ``(date_string, issue_number)`` tuple. ``date_string`` is *date*
        formatted as ``YYYYMMDD``; ``issue_number`` is the number of days
        since ``firstDate`` plus one, so ``firstDate`` itself is issue 1.
    """
    issuenumber = (date - firstDate).days + 1
    # Zero-pad both month and day: the previous hand-built string padded only
    # the month, producing a 7-character date for days 1-9.
    strToday = date.strftime("%Y%m%d")
    return strToday, issuenumber
def _parse_WHO_page_lines(lines, headers, country_names):
    """Collect the complete table rows found in the text lines of one page.

    A row opens on a line that exactly equals a known country name and is
    completed by the next ``len(headers) - 1`` lines that parse as integers.
    Lines that are neither are skipped without closing the open row.

    Args:
        lines: Text lines extracted from a single PDF page.
        headers: Column labels; ``headers[0]`` receives the country name and
            the remaining labels receive the integers in order.
        country_names: Container of country names supporting ``in``.

    Returns:
        A list of dicts, one per completed row, keyed by *headers*.
    """
    rows = []
    datarow = {}
    header_place = 0  # index of the next header to fill; 0 means no open row
    for line in lines:
        if line in country_names:
            # A country name always starts a fresh row. Appending it to a
            # half-filled row would store the name under a numeric column
            # and could push header_place past the end of headers.
            datarow = {headers[0]: line}
            header_place = 1
            continue
        if header_place == 0:
            continue  # no row open, nothing to attach a number to
        try:
            number = int(line)
        except ValueError:
            continue  # not an integer cell; keep waiting for the next one
        datarow[headers[header_place]] = number
        if header_place == len(headers) - 1:
            rows.append(datarow)
            datarow = {}
            header_place = 0
        else:
            header_place += 1
    return rows


def get_latest_WHO_Data():
    """Download the newest WHO situation report and parse its country rows.

    Today's report is used when ``pdf_exists`` finds it; otherwise
    yesterday's URL is used without checking that it exists.

    Returns:
        A ``(data, headers)`` tuple: ``data`` is a list of dicts keyed by the
        labels in ``headers``, and ``headers`` is the list of column labels.
    """
    headers = ['country', 'total confirmed cases', 'total deaths', 'total new deaths']

    today = datetime.now().date()
    strToday, issuenumber_today = prepare_WHO_grabberdata(today)
    searchlink = link.format(strToday, str(issuenumber_today))
    if not pdf_exists(searchlink):
        # Today's issue is not published (yet): fall back to yesterday's.
        yesterday = today - timedelta(1)
        strYesterday, issuenumber_yesterday = prepare_WHO_grabberdata(yesterday)
        searchlink = link.format(strYesterday, str(issuenumber_yesterday))

    # Build the lookup set once instead of scanning the list for every line.
    country_names = set(countries)

    data = []
    pdfFileObj = get_pdf_stream(searchlink)
    try:
        # NOTE(review): PdfFileReader/getNumPages/getPage/extractText are the
        # legacy PyPDF2 names; newer releases renamed them - confirm the
        # pinned PyPDF2 version before upgrading.
        pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
        for pageNum in range(pdfReader.getNumPages()):
            page_lines = pdfReader.getPage(pageNum).extractText().split("\n")
            # Row state is per page: a row never continues onto the next page.
            data.extend(_parse_WHO_page_lines(page_lines, headers, country_names))
    finally:
        # Release the buffer even when the PDF cannot be parsed.
        pdfFileObj.close()
    print('Done')
    return data, headers
def sort_WHO_data_high_to_low(data):
    """Return the rows ordered by descending ``'cases'`` with a 1-based rank.

    Args:
        data: Iterable of dicts that each contain a ``'cases'`` key.

    Returns:
        A new list holding the same dict objects, largest ``'cases'`` first.
        Each dict gains an ``'index'`` key with its position starting at 1;
        the dicts passed in are modified in place.
    """
    # NOTE(review): rows built by get_latest_WHO_Data are keyed by
    # 'total confirmed cases', not 'cases' - confirm which producer feeds this.
    ranked = sorted(data, key=lambda row: row['cases'], reverse=True)
    for position, row in enumerate(ranked, start=1):
        row['index'] = position
    return ranked
def sort_WHO_data_low_to_high(data):
    """Return the rows ordered by ascending ``'cases'`` with a 1-based rank.

    Args:
        data: Iterable of dicts that each contain a ``'cases'`` key.

    Returns:
        A new list holding the same dict objects, smallest ``'cases'`` first.
        Each dict gains an ``'index'`` key with its position starting at 1;
        the dicts passed in are modified in place.
    """
    sorted_data = sorted(data, key=lambda i: i['cases'])
    # enumerate(start=1) pairs each row with its own rank. Indexing the list
    # with the rank itself skipped the first row and ran off the end.
    for rank, row in enumerate(sorted_data, start=1):
        row['index'] = rank
    return sorted_data
# Script section: these statements sit at module level, so the report is
# downloaded and printed whenever this file is executed or imported.
data, headers = get_latest_WHO_Data()
# Print every parsed row (a dict keyed by the header labels) on its own line.
for country in data:
    print(country)