
Commit c24b231
Moving to BeautifulSoup. Radio not ready yet.
Dagur committed Oct 7, 2014
1 parent 78f471e commit c24b231
Showing 3 changed files with 36 additions and 44 deletions.
2 changes: 1 addition & 1 deletion plugin.video.sarpur/sarpur/__init__.py
@@ -4,7 +4,7 @@
 import sys
 import xbmcaddon
 
-ALWAYS_REFRESH = False
+ALWAYS_REFRESH = True
 LOGGING_ENABLED = True
 
 BASE_URL = sys.argv[0]
19 changes: 8 additions & 11 deletions plugin.video.sarpur/sarpur/cached.py
@@ -20,28 +20,25 @@ def __init__(self):
             delta = datetime.now() - datetime.fromtimestamp(last_modified)
 
             if sarpur.ALWAYS_REFRESH or delta.days > 0:
-                showtree = self.update_showtree()
-                tabs = self.update_tabs()
+                self.update_showtree()
+                self.update_tabs()
             else:
-                tabs = json.load(file(TABFILE_LOCATION, 'rb'))
-                showtree = json.load(file(SHOWTREEFILE_LOCATION, 'rb'))
+                self.tabs = json.load(file(TABFILE_LOCATION, 'rb'))
+                self.showtree = json.load(file(SHOWTREEFILE_LOCATION, 'rb'))
 
         except OSError:
             if not os.path.exists(DATA_PATH):
                 os.makedirs(DATA_PATH)
-            showtree = self.update_showtree()
-            tabs = self.update_tabs()
+            self.update_showtree()
+            self.update_tabs()
         except IOError:
             try:
                 os.unlink(SHOWTREEFILE_LOCATION)
                 os.unlink(TABFILE_LOCATION)
             except OSError:
                 None
-            showtree = self.update_showtree()
-            tabs = self.update_tabs()
-
-        self.tabs = tabs
-        self.showtree = showtree
+            self.update_showtree()
+            self.update_tabs()
 
     def update_tabs(self):
         "populate latest_groups"
59 changes: 27 additions & 32 deletions plugin.video.sarpur/sarpur/scraper.py
@@ -1,19 +1,14 @@
 #!/usr/bin/env python
 # encoding: UTF-8
 
-import requests, re
-from html5lib import treebuilders
-from xml.etree import ElementTree
+import re
+import requests
+from bs4 import BeautifulSoup
 
 
 def get_document(url):
     req = requests.get(url)
-    source = req.content
-
-    builder = treebuilders.getTreeBuilder("etree", ElementTree)
-    doc = html5lib.parse(source,
-                         treebuilder=builder,
-                         namespaceHTMLElements=False)
+    doc = BeautifulSoup(req.content)
 
     return doc
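BeautifulSoup is called here without naming a parser, so it silently uses whichever parser it finds first (lxml, html5lib, or the stdlib one), and parse trees can differ between machines. A sketch that pins the parser, assuming html5lib is still installed:

    import requests
    from bs4 import BeautifulSoup

    def get_document(url):
        req = requests.get(url)
        # Naming the parser keeps parsing behaviour identical everywhere.
        return BeautifulSoup(req.content, "html5lib")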

@@ -23,14 +18,14 @@ def get_episodes(url):
     doc = get_document(url)
 
     #Generic look
-    for episode in doc.xpath("//a[contains(@title, 'Spila')]"):
+    for episode in doc.findall(".//a[contains(@title, 'Spila')]"):
         episodes.append((episode.text, episode.get('href')))
 
     if episodes:
         return episodes
 
     #"Special" page
-    for episode in doc.xpath("//div[contains(@class,'mm-mynd')]"):
+    for episode in doc.findall(".//div[contains(@class,'mm-mynd')]"):
         episode_date = episode.getparent().find('span').text
         url = u'http://www.ruv.is{0}'.format(episode.find('a').attrib.get('href'))
         episodes.append((episode_date, url))
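As committed this can't run yet: BeautifulSoup objects have find_all, not findall, and they don't understand XPath predicates like contains() — consistent with the "not ready yet" in the commit message. A bs4 sketch of the same two lookups, where compiled regexes stand in for contains() and .parent replaces lxml's .getparent():

    import re

    for episode in doc.find_all("a", title=re.compile("Spila")):
        episodes.append((episode.get_text(), episode.get('href')))

    for episode in doc.find_all("div", class_=re.compile("mm-mynd")):
        episode_date = episode.parent.find('span').get_text()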
@@ -39,10 +34,10 @@ def get_episodes(url):
 
 def get_tabs():
     doc = get_document("http://www.ruv.is/sarpurinn")
-    xpathstring = "//div[@class='menu-block-ctools-menu-sarpsmynd-1 menu-name-menu-sarpsmynd parent-mlid-_active:0 menu-level-2']/ul/li/a"
+    findallstring = ".//div[@class='menu-block-ctools-menu-sarpsmynd-1 menu-name-menu-sarpsmynd parent-mlid-_active:0 menu-level-2']/ul/li/a"
     tabs = []
 
-    for hyperlink in doc.xpath(xpathstring):
+    for hyperlink in doc.findall(findallstring):
         tabs.append((hyperlink.text, hyperlink.get('href')))
 
     return tabs
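bs4 matches multi-valued class attributes one class at a time, so the long compound class string isn't needed. A sketch that picks the menu block by a single class (assuming that class alone identifies it):

    menu = doc.find("div", class_="menu-name-menu-sarpsmynd")
    for hyperlink in menu.find_all("a"):
        tabs.append((hyperlink.get_text(), hyperlink.get('href')))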
@@ -51,17 +46,17 @@ def get_showtree():
     doc = get_document('http://dagskra.ruv.is/thaettir/')
     showtree = []
 
-    for i, channel in enumerate(doc.xpath("//div[@style]")):
+    #.//div[@style]
+    for i, channel in enumerate(doc.findall("div", style=True)):
         channel_name = channel.find("h1").text
         showtree.append({"name": channel_name, "categories": []})
 
-        for group in channel.find("div").iterchildren():
+        for group in channel.find("div"):
             if group.tag == 'h2':
                 showtree[i]["categories"].append(
                     {"name":group.text, "shows":[]})
             elif group.tag == 'div':
-                for show in group.findall("div"):
-                    hyperlink = show.find("a")
+                for hyperlink in group.findall("div/a"):
                     show_info = (hyperlink.text, hyperlink.get('href'))
                     showtree[i]["categories"][-1]['shows'].append(show_info)
     return showtree
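The style=True keyword already matches bs4's find_all signature, but the method name, .tag, and the "div/a" path are still lxml idioms. A bs4 sketch of the same walk, assuming the page structure the old XPath implies (tags expose .name, and find_all(True, recursive=False) yields direct child tags while skipping bare whitespace strings):

    for i, channel in enumerate(doc.find_all("div", style=True)):
        channel_name = channel.find("h1").get_text()
        showtree.append({"name": channel_name, "categories": []})

        for group in channel.find("div").find_all(True, recursive=False):
            if group.name == 'h2':
                showtree[i]["categories"].append(
                    {"name": group.get_text(), "shows": []})
            elif group.name == 'div':
                for show in group.find_all("div", recursive=False):
                    hyperlink = show.find("a")
                    show_info = (hyperlink.get_text(), hyperlink.get('href'))
                    showtree[i]["categories"][-1]['shows'].append(show_info)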
@@ -71,23 +66,23 @@ def get_stream_info(page_url):
     doc = get_document(page_url)
 
     #Get urls for the swf player and playpath
-    params = doc.xpath('//param')
+    params = doc.findall('.//param')
     swfplayer = 'http://ruv.is{0}'.format(params[0].get('value'))
     details = params[1].get('value')
     playpath = re.search('streamer=(.*?)&(file=.*?)&stre', details).group(2)
 
     # Get the url of the actual rtmp stream
-    source_tags = doc.xpath('//source')
+    source_tags = doc.findall('.//source')
     if source_tags and source_tags[0].attrib.get('src'): #RÚV
         rtmp_url = source_tags[0].get('src')
     else: #RÁS 1 & 2
         # The ip address of the stream server is returned in another page
-        cache_url = doc.xpath("//script[contains(@src, 'load.cache.is')]")[0].get('src')
+        cache_url = doc.findall(".//script[contains(@src, 'load.cache.is')]")[0].get('src')
         res = requests.get(cache_url)
         cache_ip = re.search('"([^"]+)"', res.content).group(1)
 
         # Now that we have the ip address we can insert it into the URL
-        source_js = doc.xpath("//script[contains(., 'tengipunktur')]")[0].text
+        source_js = doc.findall(".//script[contains(., 'tengipunktur')]")[0].text
         source_url = re.search(r"'file': '(http://' \+ tengipunktur \+ '[^']+)", source_js).group(1)
 
         rtmp_url = source_url.replace("' + tengipunktur + '", cache_ip)
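The two script lookups are the tricky part of this hunk: one filters on an attribute, the other on the script's text, and neither contains() form exists in bs4. A sketch using compiled regexes for both (doc.find_all('param') and doc.find_all('source') cover the simpler lines):

    import re

    cache_script = doc.find("script", src=re.compile(r"load\.cache\.is"))
    cache_url = cache_script.get('src')

    source_script = doc.find("script", text=re.compile("tengipunktur"))
    source_js = source_script.string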
@@ -100,15 +95,15 @@ def get_tab_items(url):
     episodes = []
 
     #Every tab has a player with the newest/featured item. Get the name of it.
-    featured_item = doc.xpath("//div[@class='kubbur sarpefst']/div/h1")
+    featured_item = doc.findall(".//div[@class='kubbur sarpefst']/div/h1")
     if featured_item:
         featured_name = featured_item[0].text
         episodes.append((featured_name, url))
 
     #Get the listed items
-    for item in doc.xpath("//ul[@class='sarplisti']/li"):
-        item_link = item.xpath("a")[0].attrib
-        item_date = item.xpath("em")[0].text
+    for item in doc.findall(".//ul[@class='sarplisti']/li"):
+        item_link = item.findall("a")[0].attrib
+        item_date = item.findall("em")[0].text
         page_url = u"http://www.ruv.is{0}".format(item_link['href'])
         title = u"{0} - {1}".format(item_link.get('title'), item_date)
         episodes.append((title, page_url))
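A bs4 sketch of the same page walk, matching each block by a single class and using find() on sub-trees in place of the relative lookups:

    featured = doc.find("div", class_="sarpefst")
    if featured:
        episodes.append((featured.find("h1").get_text(), url))

    for item in doc.find("ul", class_="sarplisti").find_all("li"):
        item_link = item.find("a")
        item_date = item.find("em").get_text()
        page_url = u"http://www.ruv.is{0}".format(item_link['href'])
        title = u"{0} - {1}".format(item_link.get('title'), item_date)
        episodes.append((title, page_url))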
@@ -120,9 +115,9 @@ def get_podcast_shows():
     doc = get_document("http://www.ruv.is/podcast")
     shows = []
 
-    for show in doc.xpath("//ul[@class='hladvarp-info']"):
-        title = show.xpath('li/h4')[0].text
-        url = show.xpath("li/a[contains(@href,'http')]")[0].attrib.get('href')
+    for show in doc.findall(".//ul[@class='hladvarp-info']"):
+        title = show.findall('li/h4')[0].text
+        url = show.findall("li/a[contains(@href,'http')]")[0].attrib.get('href')
         shows.append((title, url))
 
     return shows
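Same caveat as above: the contains(@href,'http') predicate isn't bs4, but an href regex is. A sketch:

    import re

    for show in doc.find_all("ul", class_="hladvarp-info"):
        title = show.find("h4").get_text()
        url = show.find("a", href=re.compile("http")).get('href')
        shows.append((title, url))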
@@ -132,14 +127,14 @@ def get_podcast_episodes(url):
     doc = get_document(url)
     episodes = []
 
-    for item in doc.findall("//guid"):
+    for item in doc.findall(".//guid"):
         url = item.text
         for element in item.itersiblings():
             if element.tag == 'pubdate':
                 date = element.text
 
-        #date = item.xpath('pubdate')[0].text
-        #url = item.xpath('guid')[0].text
+        #date = item.findall('pubdate')[0].text
+        #url = item.findall('guid')[0].text
         episodes.append((date, url))
 
     return episodes
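itersiblings() is another lxml method that bs4 lacks, consistent with "Radio not ready yet". bs4's sibling traversal covers it, and bs4's HTML parsers lowercase tag names, which is why the RSS pubDate element is matched as 'pubdate'. A sketch:

    for item in doc.find_all("guid"):
        url = item.get_text()
        # guid and pubDate are siblings inside each <item>
        pubdate = item.find_next_sibling("pubdate")
        date = pubdate.get_text() if pubdate else None
        episodes.append((date, url))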
@@ -150,5 +145,5 @@ def get_live_url(channel='ruv'):
     }
 
     doc = get_document(page_urls.get(channel))
-    return doc.xpath("//div[@id='spilarinn']/video/source")[0].attrib['src']
+    return doc.findall(".//div[@id='spilarinn']/video/source")[0].attrib['src']
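bs4's CSS selector support handles this id-plus-descendants pattern directly; a sketch:

    return doc.select("div#spilarinn video source")[0]['src']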
