From c50f6df16c3545a1b243c4fcdefcd420b4836c8c Mon Sep 17 00:00:00 2001 From: macs1207 Date: Sat, 29 Feb 2020 01:48:21 +0800 Subject: [PATCH] Fix announcement parser --- src/crawler/school_announcements_crawler.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/crawler/school_announcements_crawler.py b/src/crawler/school_announcements_crawler.py index 8811dc7..53d5dd1 100644 --- a/src/crawler/school_announcements_crawler.py +++ b/src/crawler/school_announcements_crawler.py @@ -1,5 +1,6 @@ from lxml import etree import requests +import re from utils import error_code from utils import config @@ -30,9 +31,9 @@ def acad(page=0): if req.status_code == 200: req = req.json()['content'] - root = etree.HTML(req) - date = root.xpath('//*[@class="mdate before"]') + node = root.xpath('//*[@class="d-txt"]') + date = [node[i] for i in range(0, len(node), 3)] href = root.xpath('//*[@class="d-txt"]//a') base_id = page*15 @@ -42,7 +43,7 @@ def acad(page=0): 'info':{ 'id': base_id+index, 'title': href_data.attrib['title'], - 'date':date_time.text + 'date': re.search("([12]\d{3}-(0[1-9]|1[0-2])-(0[1-9]|[12]\d|3[01]))", date_time.text).group() } } for index, (date_time, href_data) in enumerate(zip(date, href))] return notification