-
Notifications
You must be signed in to change notification settings - Fork 0
/
jobs.py
125 lines (93 loc) · 4.41 KB
/
jobs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
from gevent import monkey; monkey.patch_all()
from BeautifulSoup import BeautifulSoup
from datetime import datetime, date, timedelta, time
from model import *
from mongoengine import connect
import gevent
import itertools
import logging
import re
import urllib2
# Module-level logger named "jobs"; used by the scraper functions and by the
# script entry point at the bottom of this file.
log = logging.getLogger("jobs")
def untguiden():
    """Scrape activities from untguiden.teknomedia.se.

    For every date in the scrape window (currently only today) the event
    list page is fetched, each event link on it is followed, and one
    ``Activity`` is built per event page.

    Returns:
        list: unsaved ``Activity`` objects with ``city="uppsala"`` and
        ``source="untguiden"``. ``url``, ``email`` and ``phone`` are only
        set when the corresponding label is present on the event page.

    Raises:
        Errors from ``urllib2.urlopen`` are not caught here and propagate
        to the caller, as does an ``AttributeError`` if an event page has
        no ``<h1>`` header.
    """
    # Imported locally under an alias so the date helpers below do not
    # depend on what the module-level name ``datetime`` is bound to: the
    # file does ``from datetime import datetime`` (the class, which has no
    # ``.datetime`` attribute) followed by ``from model import *``, which
    # may or may not rebind that name.
    import datetime as _dt

    log.debug("Getting activities from untguiden")
    activities = []
    base_url = "http://untguiden.teknomedia.se"
    today = _dt.date.today()
    dates = (today + _dt.timedelta(days=i) for i in range(1))
    for act_date in dates:
        # Every activity found for this date starts at midnight of that
        # date; computed once per date rather than once per event.
        starts_at = _dt.datetime.combine(act_date, _dt.time())

        eventlist_url = "%s/Default.aspx?action=search&c=3443&d=%s" % (base_url, act_date)
        list_doc = urllib2.urlopen(eventlist_url).read()
        list_soup = BeautifulSoup(list_doc)
        event_paths = (item.find("a")["href"] for item in list_soup.findAll("div", "Event"))
        for event_path in event_paths:
            item_url = "%s%s" % (base_url, event_path)
            item_doc = urllib2.urlopen(item_url).read()
            item_soup = BeautifulSoup(item_doc)

            act = Activity(city="uppsala")
            act.source = "untguiden"

            header = item_soup.find("h1")
            act.name = header.find(text=True).strip()
            # The description is the text of the header's parent with the
            # leading name removed. The name is escaped so that regex
            # metacharacters in an event title (parentheses, "+", "?", ...)
            # are matched literally instead of being interpreted as pattern.
            act.description = re.sub(
                r"^%s" % re.escape(act.name), "", header.parent.text, count=1
            ).strip()

            # Each optional field is located via its <strong> label text.
            web_label = item_soup.find("strong", text="Webb")
            if web_label:
                web_link = web_label.parent.findNextSibling("a")
                # Label without a following link: leave the field unset
                # rather than failing on None["href"].
                if web_link is not None:
                    act.url = web_link["href"].strip()

            email_label = item_soup.find("strong", text="E-post")
            if email_label:
                email_link = email_label.parent.findNextSibling("a")
                if email_link is not None:
                    act.email = email_link.find(text=True).strip()

            phone_label = item_soup.find("strong", text="Kontakt")
            if phone_label:
                act.phone = re.sub(r"^Kontakt: ", "", phone_label.parent.parent.text).strip()

            act.starts_at = starts_at
            activities.append(act)
    return activities
def destinationuppsala():
    """Scrape activities from www.destinationuppsala.se.

    For every date in the scrape window (currently only today) the event
    search page is fetched with that date as both start and end, each
    ``evListObject`` link on it is followed, and one ``Activity`` is built
    per event page.

    Returns:
        list: unsaved ``Activity`` objects with ``city="uppsala"`` and
        ``source="destinationuppsala"``. ``description``, ``phone``,
        ``url`` and ``email`` are only set when the corresponding element
        is present on the event page.

    Raises:
        Errors from ``urllib2.urlopen`` are not caught here and propagate
        to the caller, as does an ``AttributeError`` if an event page has
        no ``<font class="head1">`` title element.
    """
    # Imported locally under an alias so the date helpers below do not
    # depend on what the module-level name ``datetime`` is bound to: the
    # file does ``from datetime import datetime`` (the class, which has no
    # ``.datetime`` attribute) followed by ``from model import *``, which
    # may or may not rebind that name.
    import datetime as _dt

    log.debug("Getting activities from destionation uppsala")
    activities = []
    base_url = "http://www.destinationuppsala.se"
    # The detail fields are <span> elements whose id ends with a fixed
    # suffix; the patterns are compiled once, outside the loops.
    desclabel_re = re.compile("Description$")
    phonelabel_re = re.compile("Phone$")
    urllabel_re = re.compile("Web$")
    emaillabel_re = re.compile("Email$")
    today = _dt.date.today()
    dates = (today + _dt.timedelta(days=i) for i in range(1))
    for act_date in dates:
        # Every activity found for this date starts at midnight of that
        # date; computed once per date rather than once per event.
        starts_at = _dt.datetime.combine(act_date, _dt.time())

        eventlist_url = "%s/DynPage.aspx?id=9582&search=true&start=%s&end=%s&cat=9&txt=" % (base_url, act_date, act_date)
        list_doc = urllib2.urlopen(eventlist_url).read()
        list_soup = BeautifulSoup(list_doc)
        event_paths = (item["href"] for item in list_soup.findAll("a", "evListObject"))
        for event_path in event_paths:
            item_url = "%s/%s" % (base_url, event_path)
            item_doc = urllib2.urlopen(item_url).read()
            item_soup = BeautifulSoup(item_doc, fromEncoding="utf-8")

            act = Activity(city="uppsala")
            act.source = "destinationuppsala"
            act.name = item_soup.find("font", "head1").find(text=True).strip()

            desc_span = item_soup.find("span", id=desclabel_re)
            if desc_span:
                desc_text = desc_span.find(text=True)
                if desc_text:
                    act.description = desc_text.strip()

            phone_span = item_soup.find("span", id=phonelabel_re)
            if phone_span:
                act.phone = phone_span.text.replace("Tfn:", "").strip()

            web_span = item_soup.find("span", id=urllabel_re)
            if web_span:
                web_link = web_span.find("a")
                # Span without a link inside: leave the field unset rather
                # than failing on None["href"].
                if web_link is not None:
                    act.url = web_link["href"].strip()

            email_span = item_soup.find("span", id=emaillabel_re)
            if email_span:
                email_link = email_span.find("a")
                if email_link is not None:
                    act.email = email_link.find(text=True).strip()

            act.starts_at = starts_at
            activities.append(act)
    return activities
if __name__ == '__main__':
    # Script entry point: run both scrapers as greenlets, remove old
    # activities from the database, then save every parsed activity that is
    # not already stored.
    logging.basicConfig(format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", level=logging.DEBUG)

    # The scraper greenlets are spawned before the database work below and
    # are only waited for at gevent.joinall().
    jobs = [gevent.spawn(j) for j in (untguiden, destinationuppsala)]
    connect("uppsalabarn")

    deleted_count = 0
    for old_activity in Activity.old():
        old_activity.delete()
        deleted_count += 1
    log.debug("Deleted %d old activities", deleted_count)

    gevent.joinall(jobs)

    # NOTE(review): the set arithmetic below relies on how Activity defines
    # equality and hashing in the model module, which is not visible here.
    current_activities = set(Activity.objects())
    parsed_activities = set()
    for job in jobs:
        if job.successful():
            parsed_activities.update(job.value)
        else:
            # A failed greenlet has value None; report it and keep the
            # results of the other scraper instead of failing on None.
            log.error("Scraper job failed: %r", job.exception)

    new_activities = parsed_activities - current_activities
    log.debug("Current act length: %s", len(current_activities))
    log.debug("Parsed act length: %s", len(parsed_activities))
    for act in new_activities:
        act.save()