From c24b2313f4f9c5584df1b9f60232e36402c6844a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dagur=20P=C3=A1ll=20Ammendrup?= Date: Tue, 7 Oct 2014 21:47:50 +0000 Subject: [PATCH] Moving to BeautifulSoup. Radio not ready yet. --- plugin.video.sarpur/sarpur/__init__.py | 2 +- plugin.video.sarpur/sarpur/cached.py | 19 ++++----- plugin.video.sarpur/sarpur/scraper.py | 59 ++++++++++++-------------- 3 files changed, 36 insertions(+), 44 deletions(-) diff --git a/plugin.video.sarpur/sarpur/__init__.py b/plugin.video.sarpur/sarpur/__init__.py index e7e50d6..c331c7a 100644 --- a/plugin.video.sarpur/sarpur/__init__.py +++ b/plugin.video.sarpur/sarpur/__init__.py @@ -4,7 +4,7 @@ import sys import xbmcaddon -ALWAYS_REFRESH = False +ALWAYS_REFRESH = True LOGGING_ENABLED = True BASE_URL = sys.argv[0] diff --git a/plugin.video.sarpur/sarpur/cached.py b/plugin.video.sarpur/sarpur/cached.py index 81cb4a5..3a98c30 100644 --- a/plugin.video.sarpur/sarpur/cached.py +++ b/plugin.video.sarpur/sarpur/cached.py @@ -20,28 +20,25 @@ def __init__(self): delta = datetime.now() - datetime.fromtimestamp(last_modified) if sarpur.ALWAYS_REFRESH or delta.days > 0: - showtree = self.update_showtree() - tabs = self.update_tabs() + self.update_showtree() + self.update_tabs() else: - tabs = json.load(file(TABFILE_LOCATION, 'rb')) - showtree = json.load(file(SHOWTREEFILE_LOCATION, 'rb')) + self.tabs = json.load(file(TABFILE_LOCATION, 'rb')) + self.showtree = json.load(file(SHOWTREEFILE_LOCATION, 'rb')) except OSError: if not os.path.exists(DATA_PATH): os.makedirs(DATA_PATH) - showtree = self.update_showtree() - tabs = self.update_tabs() + self.update_showtree() + self.update_tabs() except IOError: try: os.unlink(SHOWTREEFILE_LOCATION) os.unlink(TABFILE_LOCATION) except OSError: None - showtree = self.update_showtree() - tabs = self.update_tabs() - - self.tabs = tabs - self.showtree = showtree + self.update_showtree() + self.update_tabs() def update_tabs(self): "populate latest_groups" diff --git a/plugin.video.sarpur/sarpur/scraper.py b/plugin.video.sarpur/sarpur/scraper.py index 5c426c3..334ca6e 100644 --- a/plugin.video.sarpur/sarpur/scraper.py +++ b/plugin.video.sarpur/sarpur/scraper.py @@ -1,19 +1,14 @@ #!/usr/bin/env python # encoding: UTF-8 -import requests, re -from html5lib import treebuilders -from xml.etree import ElementTree +import re +import requests +from bs4 import BeautifulSoup def get_document(url): req = requests.get(url) - source = req.content - - builder = treebuilders.getTreeBuilder("etree", ElementTree) - doc = html5lib.parse(source, - treebuilder=builder, - namespaceHTMLElements=False) + doc = BeautifulSoup(req.content) return doc @@ -23,14 +18,14 @@ def get_episodes(url): doc = get_document(url) #Generic look - for episode in doc.xpath("//a[contains(@title, 'Spila')]"): + for episode in doc.findall(".//a[contains(@title, 'Spila')]"): episodes.append((episode.text, episode.get('href'))) if episodes: return episodes #"Special" page - for episode in doc.xpath("//div[contains(@class,'mm-mynd')]"): + for episode in doc.findall(".//div[contains(@class,'mm-mynd')]"): episode_date = episode.getparent().find('span').text url = u'http://www.ruv.is{0}'.format(episode.find('a').attrib.get('href')) episodes.append((episode_date, url)) @@ -39,10 +34,10 @@ def get_episodes(url): def get_tabs(): doc = get_document("http://www.ruv.is/sarpurinn") - xpathstring = "//div[@class='menu-block-ctools-menu-sarpsmynd-1 menu-name-menu-sarpsmynd parent-mlid-_active:0 menu-level-2']/ul/li/a" + findallstring = ".//div[@class='menu-block-ctools-menu-sarpsmynd-1 menu-name-menu-sarpsmynd parent-mlid-_active:0 menu-level-2']/ul/li/a" tabs = [] - for hyperlink in doc.xpath(xpathstring): + for hyperlink in doc.findall(findallstring): tabs.append((hyperlink.text, hyperlink.get('href'))) return tabs @@ -51,17 +46,17 @@ def get_showtree(): doc = get_document('http://dagskra.ruv.is/thaettir/') showtree = [] - for i, channel in enumerate(doc.xpath("//div[@style]")): + #.//div[@style] + for i, channel in enumerate(doc.findall("div", style=True)): channel_name = channel.find("h1").text showtree.append({"name": channel_name, "categories": []}) - for group in channel.find("div").iterchildren(): + for group in channel.find("div"): if group.tag == 'h2': showtree[i]["categories"].append( {"name":group.text, "shows":[]}) elif group.tag == 'div': - for show in group.findall("div"): - hyperlink = show.find("a") + for hyperlink in group.findall("div/a"): show_info = (hyperlink.text, hyperlink.get('href')) showtree[i]["categories"][-1]['shows'].append(show_info) return showtree @@ -71,23 +66,23 @@ def get_stream_info(page_url): doc = get_document(page_url) #Get urls for the swf player and playpath - params = doc.xpath('//param') + params = doc.findall('.//param') swfplayer = 'http://ruv.is{0}'.format(params[0].get('value')) details = params[1].get('value') playpath = re.search('streamer=(.*?)&(file=.*?)&stre', details).group(2) # Get the url of the actual rtmp stream - source_tags = doc.xpath('//source') + source_tags = doc.findall('.//source') if source_tags and source_tags[0].attrib.get('src'): #RÚV rtmp_url = source_tags[0].get('src') else: #RÁS 1 & 2 # The ip address of the stream server is returned in another page - cache_url = doc.xpath("//script[contains(@src, 'load.cache.is')]")[0].get('src') + cache_url = doc.findall(".//script[contains(@src, 'load.cache.is')]")[0].get('src') res = requests.get(cache_url) cache_ip = re.search('"([^"]+)"', res.content).group(1) # Now that we have the ip address we can insert it into the URL - source_js = doc.xpath("//script[contains(., 'tengipunktur')]")[0].text + source_js = doc.findall(".//script[contains(., 'tengipunktur')]")[0].text source_url = re.search(r"'file': '(http://' \+ tengipunktur \+ '[^']+)", source_js).group(1) rtmp_url = source_url.replace("' + tengipunktur + '", cache_ip) @@ -100,15 +95,15 @@ def get_tab_items(url): episodes = [] #Every tab has a player with the newest/featured item. Get the name of it. - featured_item = doc.xpath("//div[@class='kubbur sarpefst']/div/h1") + featured_item = doc.findall(".//div[@class='kubbur sarpefst']/div/h1") if featured_item: featured_name = featured_item[0].text episodes.append((featured_name, url)) #Get the listed items - for item in doc.xpath("//ul[@class='sarplisti']/li"): - item_link = item.xpath("a")[0].attrib - item_date = item.xpath("em")[0].text + for item in doc.findall(".//ul[@class='sarplisti']/li"): + item_link = item.findall("a")[0].attrib + item_date = item.findall("em")[0].text page_url = u"http://www.ruv.is{0}".format(item_link['href']) title = u"{0} - {1}".format(item_link.get('title'), item_date) episodes.append((title, page_url)) @@ -120,9 +115,9 @@ def get_podcast_shows(): doc = get_document("http://www.ruv.is/podcast") shows = [] - for show in doc.xpath("//ul[@class='hladvarp-info']"): - title = show.xpath('li/h4')[0].text - url = show.xpath("li/a[contains(@href,'http')]")[0].attrib.get('href') + for show in doc.findall(".//ul[@class='hladvarp-info']"): + title = show.findall('li/h4')[0].text + url = show.findall("li/a[contains(@href,'http')]")[0].attrib.get('href') shows.append((title, url)) return shows @@ -132,14 +127,14 @@ def get_podcast_episodes(url): doc = get_document(url) episodes = [] - for item in doc.findall("//guid"): + for item in doc.findall(".//guid"): url = item.text for element in item.itersiblings(): if element.tag == 'pubdate': date = element.text - #date = item.xpath('pubdate')[0].text - #url = item.xpath('guid')[0].text + #date = item.findall('pubdate')[0].text + #url = item.findall('guid')[0].text episodes.append((date, url)) return episodes @@ -150,5 +145,5 @@ def get_live_url(channel='ruv'): } doc = get_document(page_urls.get(channel)) - return doc.xpath("//div[@id='spilarinn']/video/source")[0].attrib['src'] + return doc.findall(".//div[@id='spilarinn']/video/source")[0].attrib['src']