
Commit c24b231
Moving to BeautifulSoup. Radio not ready yet.
Dagur committed Oct 7, 2014
1 parent 78f471e commit c24b231
Showing 3 changed files with 36 additions and 44 deletions.
2 changes: 1 addition & 1 deletion plugin.video.sarpur/sarpur/__init__.py
@@ -4,7 +4,7 @@
 import sys
 import xbmcaddon
 
-ALWAYS_REFRESH = False
+ALWAYS_REFRESH = True
 LOGGING_ENABLED = True
 
 BASE_URL = sys.argv[0]
19 changes: 8 additions & 11 deletions plugin.video.sarpur/sarpur/cached.py
@@ -20,28 +20,25 @@ def __init__(self):
             delta = datetime.now() - datetime.fromtimestamp(last_modified)
 
             if sarpur.ALWAYS_REFRESH or delta.days > 0:
-                showtree = self.update_showtree()
-                tabs = self.update_tabs()
+                self.update_showtree()
+                self.update_tabs()
             else:
-                tabs = json.load(file(TABFILE_LOCATION, 'rb'))
-                showtree = json.load(file(SHOWTREEFILE_LOCATION, 'rb'))
+                self.tabs = json.load(file(TABFILE_LOCATION, 'rb'))
+                self.showtree = json.load(file(SHOWTREEFILE_LOCATION, 'rb'))
 
         except OSError:
             if not os.path.exists(DATA_PATH):
                 os.makedirs(DATA_PATH)
-            showtree = self.update_showtree()
-            tabs = self.update_tabs()
+            self.update_showtree()
+            self.update_tabs()
         except IOError:
             try:
                 os.unlink(SHOWTREEFILE_LOCATION)
                 os.unlink(TABFILE_LOCATION)
             except OSError:
                 None
-            showtree = self.update_showtree()
-            tabs = self.update_tabs()
-
-        self.tabs = tabs
-        self.showtree = showtree
+            self.update_showtree()
+            self.update_tabs()
 
     def update_tabs(self):
         "populate latest_groups"
59 changes: 27 additions & 32 deletions plugin.video.sarpur/sarpur/scraper.py
@@ -1,19 +1,14 @@
 #!/usr/bin/env python
 # encoding: UTF-8
 
-import requests, re
-from html5lib import treebuilders
-from xml.etree import ElementTree
+import re
+import requests
+from bs4 import BeautifulSoup
 
 
 def get_document(url):
     req = requests.get(url)
-    source = req.content
-
-    builder = treebuilders.getTreeBuilder("etree", ElementTree)
-    doc = html5lib.parse(source,
-                         treebuilder=builder,
-                         namespaceHTMLElements=False)
+    doc = BeautifulSoup(req.content)
 
     return doc
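BeautifulSoup is called here without naming a parser, so it silently uses whichever parser it finds first (lxml, html5lib, or the stdlib one), and parse trees can differ between machines. A sketch that pins the parser, assuming html5lib is still installed:

    import requests
    from bs4 import BeautifulSoup

    def get_document(url):
        req = requests.get(url)
        # Naming the parser keeps parsing behaviour identical everywhere.
        return BeautifulSoup(req.content, "html5lib")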

@@ -23,14 +18,14 @@ def get_episodes(url):
     doc = get_document(url)
 
     #Generic look
-    for episode in doc.xpath("//a[contains(@title, 'Spila')]"):
+    for episode in doc.findall(".//a[contains(@title, 'Spila')]"):
         episodes.append((episode.text, episode.get('href')))
 
     if episodes:
         return episodes
 
     #"Special" page
-    for episode in doc.xpath("//div[contains(@class,'mm-mynd')]"):
+    for episode in doc.findall(".//div[contains(@class,'mm-mynd')]"):
         episode_date = episode.getparent().find('span').text
         url = u'http://www.ruv.is{0}'.format(episode.find('a').attrib.get('href'))
         episodes.append((episode_date, url))
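As committed this can't run yet: BeautifulSoup objects have find_all, not findall, and they don't understand XPath predicates like contains() — consistent with the "not ready yet" in the commit message. A bs4 sketch of the same two lookups, where compiled regexes stand in for contains() and .parent replaces lxml's .getparent():

    import re

    for episode in doc.find_all("a", title=re.compile("Spila")):
        episodes.append((episode.get_text(), episode.get('href')))

    for episode in doc.find_all("div", class_=re.compile("mm-mynd")):
        episode_date = episode.parent.find('span').get_text()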
@@ -39,10 +34,10 @@ def get_episodes(url):
 
 def get_tabs():
     doc = get_document("http://www.ruv.is/sarpurinn")
-    xpathstring = "//div[@class='menu-block-ctools-menu-sarpsmynd-1 menu-name-menu-sarpsmynd parent-mlid-_active:0 menu-level-2']/ul/li/a"
+    findallstring = ".//div[@class='menu-block-ctools-menu-sarpsmynd-1 menu-name-menu-sarpsmynd parent-mlid-_active:0 menu-level-2']/ul/li/a"
     tabs = []
 
-    for hyperlink in doc.xpath(xpathstring):
+    for hyperlink in doc.findall(findallstring):
         tabs.append((hyperlink.text, hyperlink.get('href')))
 
     return tabs
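bs4 matches multi-valued class attributes one class at a time, so the long compound class string isn't needed. A sketch that picks the menu block by a single class (assuming that class alone identifies it):

    menu = doc.find("div", class_="menu-name-menu-sarpsmynd")
    for hyperlink in menu.find_all("a"):
        tabs.append((hyperlink.get_text(), hyperlink.get('href')))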
@@ -51,17 +46,17 @@ def get_showtree():
     doc = get_document('http://dagskra.ruv.is/thaettir/')
     showtree = []
 
-    for i, channel in enumerate(doc.xpath("//div[@style]")):
+    #.//div[@style]
+    for i, channel in enumerate(doc.findall("div", style=True)):
         channel_name = channel.find("h1").text
         showtree.append({"name": channel_name, "categories": []})
 
-        for group in channel.find("div").iterchildren():
+        for group in channel.find("div"):
             if group.tag == 'h2':
                 showtree[i]["categories"].append(
                     {"name":group.text, "shows":[]})
             elif group.tag == 'div':
-                for show in group.findall("div"):
-                    hyperlink = show.find("a")
+                for hyperlink in group.findall("div/a"):
                     show_info = (hyperlink.text, hyperlink.get('href'))
                     showtree[i]["categories"][-1]['shows'].append(show_info)
     return showtree
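The style=True keyword already matches bs4's find_all signature, but the method name, .tag, and the "div/a" path are still lxml idioms. A bs4 sketch of the same walk, assuming the page structure the old XPath implies (tags expose .name, and find_all(True, recursive=False) yields direct child tags while skipping bare whitespace strings):

    for i, channel in enumerate(doc.find_all("div", style=True)):
        channel_name = channel.find("h1").get_text()
        showtree.append({"name": channel_name, "categories": []})

        for group in channel.find("div").find_all(True, recursive=False):
            if group.name == 'h2':
                showtree[i]["categories"].append(
                    {"name": group.get_text(), "shows": []})
            elif group.name == 'div':
                for show in group.find_all("div", recursive=False):
                    hyperlink = show.find("a")
                    show_info = (hyperlink.get_text(), hyperlink.get('href'))
                    showtree[i]["categories"][-1]['shows'].append(show_info)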
@@ -71,23 +66,23 @@ def get_stream_info(page_url):
     doc = get_document(page_url)
 
     #Get urls for the swf player and playpath
-    params = doc.xpath('//param')
+    params = doc.findall('.//param')
     swfplayer = 'http://ruv.is{0}'.format(params[0].get('value'))
     details = params[1].get('value')
     playpath = re.search('streamer=(.*?)&(file=.*?)&stre', details).group(2)
 
     # Get the url of the actual rtmp stream
-    source_tags = doc.xpath('//source')
+    source_tags = doc.findall('.//source')
     if source_tags and source_tags[0].attrib.get('src'): #RÚV
         rtmp_url = source_tags[0].get('src')
     else: #RÁS 1 & 2
         # The ip address of the stream server is returned in another page
-        cache_url = doc.xpath("//script[contains(@src, 'load.cache.is')]")[0].get('src')
+        cache_url = doc.findall(".//script[contains(@src, 'load.cache.is')]")[0].get('src')
         res = requests.get(cache_url)
         cache_ip = re.search('"([^"]+)"', res.content).group(1)
 
         # Now that we have the ip address we can insert it into the URL
-        source_js = doc.xpath("//script[contains(., 'tengipunktur')]")[0].text
+        source_js = doc.findall(".//script[contains(., 'tengipunktur')]")[0].text
         source_url = re.search(r"'file': '(http://' \+ tengipunktur \+ '[^']+)", source_js).group(1)
 
         rtmp_url = source_url.replace("' + tengipunktur + '", cache_ip)
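The two script lookups are the tricky part of this hunk: one filters on an attribute, the other on the script's text, and neither contains() form exists in bs4. A sketch using compiled regexes for both (doc.find_all('param') and doc.find_all('source') cover the simpler lines):

    import re

    cache_script = doc.find("script", src=re.compile(r"load\.cache\.is"))
    cache_url = cache_script.get('src')

    source_script = doc.find("script", text=re.compile("tengipunktur"))
    source_js = source_script.string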
@@ -100,15 +95,15 @@ def get_tab_items(url):
     episodes = []
 
     #Every tab has a player with the newest/featured item. Get the name of it.
-    featured_item = doc.xpath("//div[@class='kubbur sarpefst']/div/h1")
+    featured_item = doc.findall(".//div[@class='kubbur sarpefst']/div/h1")
     if featured_item:
         featured_name = featured_item[0].text
         episodes.append((featured_name, url))
 
     #Get the listed items
-    for item in doc.xpath("//ul[@class='sarplisti']/li"):
-        item_link = item.xpath("a")[0].attrib
-        item_date = item.xpath("em")[0].text
+    for item in doc.findall(".//ul[@class='sarplisti']/li"):
+        item_link = item.findall("a")[0].attrib
+        item_date = item.findall("em")[0].text
         page_url = u"http://www.ruv.is{0}".format(item_link['href'])
         title = u"{0} - {1}".format(item_link.get('title'), item_date)
         episodes.append((title, page_url))
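A bs4 sketch of the same page walk, matching each block by a single class and using find() on sub-trees in place of the relative lookups:

    featured = doc.find("div", class_="sarpefst")
    if featured:
        episodes.append((featured.find("h1").get_text(), url))

    for item in doc.find("ul", class_="sarplisti").find_all("li"):
        item_link = item.find("a")
        item_date = item.find("em").get_text()
        page_url = u"http://www.ruv.is{0}".format(item_link['href'])
        title = u"{0} - {1}".format(item_link.get('title'), item_date)
        episodes.append((title, page_url))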
@@ -120,9 +115,9 @@ def get_podcast_shows():
     doc = get_document("http://www.ruv.is/podcast")
     shows = []
 
-    for show in doc.xpath("//ul[@class='hladvarp-info']"):
-        title = show.xpath('li/h4')[0].text
-        url = show.xpath("li/a[contains(@href,'http')]")[0].attrib.get('href')
+    for show in doc.findall(".//ul[@class='hladvarp-info']"):
+        title = show.findall('li/h4')[0].text
+        url = show.findall("li/a[contains(@href,'http')]")[0].attrib.get('href')
         shows.append((title, url))
 
     return shows
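Same caveat as above: the contains(@href,'http') predicate isn't bs4, but an href regex is. A sketch:

    import re

    for show in doc.find_all("ul", class_="hladvarp-info"):
        title = show.find("h4").get_text()
        url = show.find("a", href=re.compile("http")).get('href')
        shows.append((title, url))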
@@ -132,14 +127,14 @@ def get_podcast_episodes(url):
     doc = get_document(url)
     episodes = []
 
-    for item in doc.findall("//guid"):
+    for item in doc.findall(".//guid"):
         url = item.text
         for element in item.itersiblings():
             if element.tag == 'pubdate':
                 date = element.text
 
-        #date = item.xpath('pubdate')[0].text
-        #url = item.xpath('guid')[0].text
+        #date = item.findall('pubdate')[0].text
+        #url = item.findall('guid')[0].text
         episodes.append((date, url))
 
     return episodes
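itersiblings() is another lxml method that bs4 lacks, consistent with "Radio not ready yet". bs4's sibling traversal covers it, and bs4's HTML parsers lowercase tag names, which is why the RSS pubDate element is matched as 'pubdate'. A sketch:

    for item in doc.find_all("guid"):
        url = item.get_text()
        # guid and pubDate are siblings inside each <item>
        pubdate = item.find_next_sibling("pubdate")
        date = pubdate.get_text() if pubdate else None
        episodes.append((date, url))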
@@ -150,5 +145,5 @@ def get_live_url(channel='ruv'):
     }
 
     doc = get_document(page_urls.get(channel))
-    return doc.xpath("//div[@id='spilarinn']/video/source")[0].attrib['src']
+    return doc.findall(".//div[@id='spilarinn']/video/source")[0].attrib['src']
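bs4's CSS selector support handles this id-plus-descendants pattern directly; a sketch:

    return doc.select("div#spilarinn video source")[0]['src']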
