-
Notifications
You must be signed in to change notification settings - Fork 0
/
calendar_extractor.py
69 lines (56 loc) · 2.32 KB
/
calendar_extractor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
from bs4 import BeautifulSoup
import requests
import json
import re
import calendar
from datetime import date
class ExtractCalendar:
    """Scrape holiday tables from officeholidays.com country pages.

    For each requested year the page
    ``https://www.officeholidays.com/countries/{country}/{year}`` is
    downloaded, every ``<table class="country-table">`` is read, and each
    table row is turned into a list of cleaned cell strings in which the
    cell at ``DATE_COLUMN_INDEX`` is replaced by a ``datetime.date``.
    """

    # Seconds to wait on the HTTP request; without a timeout ``requests.get``
    # may block indefinitely on a stalled connection.
    REQUEST_TIMEOUT = 30

    # Zero-based position of the cell that holds the "<Mon> <day>" date text.
    DATE_COLUMN_INDEX = 1

    # Characters removed from every cell: non-breaking space, newline, comma.
    _CELL_NOISE = re.compile("[\xa0\n,]")

    def __init__(self):
        # Maps a month abbreviation to its 1-based month number. Index 0 of
        # ``calendar.month_abbr`` is an empty string and is skipped.
        # NOTE: ``calendar.month_abbr`` follows the current locale, so the
        # keys are only "Jan".."Dec" under an English/C locale.
        self.months = {
            month: index
            for index, month in enumerate(calendar.month_abbr)
            if month
        }

    def extract_calendar(self, years, country):
        """Collect the holiday rows for several years of one country.

        Args:
            years: Iterable of years; each is passed to
                ``extract_single_calendar`` and used as a result key.
            country: Country path segment used in the page URL.

        Returns:
            A ``(columns, holiday_information)`` tuple. ``columns`` is the
            header list of the most recent year that yielded a table (or
            ``None`` if no year did) and ``holiday_information`` maps each
            year to its list of rows.
        """
        self.years = years  # kept as an attribute for backward compatibility
        columns = None
        holiday_information = {}
        for year in self.years:
            year_columns, holiday_information[year] = self.extract_single_calendar(
                year, country
            )
            # A year without any table must not wipe out headers that an
            # earlier year already provided.
            if year_columns is not None:
                columns = year_columns
        return columns, holiday_information

    def extract_single_calendar(self, year, country):
        """Download and parse the holiday table(s) for one year.

        Args:
            year: Year to fetch; it is also passed as ``year=`` to
                ``datetime.date`` (so presumably an ``int`` -- confirm
                against callers).
            country: Country path segment used in the page URL.

        Returns:
            A ``(columns, holiday_info)`` tuple: the header texts of the last
            table processed (``None`` when the page has no matching table)
            and the list of parsed rows from all matching tables.
        """
        url = f"https://www.officeholidays.com/countries/{country}/{year}"
        html_content = requests.get(url, timeout=self.REQUEST_TIMEOUT).text

        # Parse HTML code for the entire site
        soup = BeautifulSoup(html_content, "lxml")
        tables = soup.find_all("table", attrs={"class": "country-table"})

        columns = None
        holiday_info = []
        for table in tables:
            table_rows = table.find_all("tr")
            if not table_rows:
                # A table without any <tr> has neither header nor data.
                continue
            header_row, *data_rows = table_rows

            # Header texts with trailing newlines stripped.
            columns = [
                header.text.rstrip("\n") for header in header_row.find_all("th")
            ]

            for data_row in data_rows:
                cell_texts = [cell.text for cell in data_row.find_all("td")]
                holiday_info.append(self._clean_row(cell_texts, year))
        return columns, holiday_info

    def _clean_row(self, cell_texts, year):
        """Strip noise characters from each cell and convert the date cell."""
        row = []
        for index, text in enumerate(cell_texts):
            cleaned = self._CELL_NOISE.sub("", text)
            if index == self.DATE_COLUMN_INDEX:
                cleaned = self._parse_holiday_date(cleaned, year)
            row.append(cleaned)
        return row

    def _parse_holiday_date(self, text, year):
        """Turn "<month abbreviation> <day>" text into a ``date`` in *year*."""
        # ``split()`` without arguments tolerates leading, trailing and
        # repeated whitespace, which ``split(" ")`` turned into empty tokens.
        date_info = text.split()
        return date(
            year=year,
            month=self.months[date_info[0]],
            day=int(date_info[1]),
        )