"""Scrapes toti.eu.com/beatles for all song lyrics and dumps them to file.
The dump has the format:
<title 1>\t<writers>\t<line 1>\\<line 2>\\...\\<line n>\n
<title 2>\t<writers>\t<line 1>\\<line 2>\\...\\<line n>\n
...
<title m>\t<writers>\t<line 1>\\<line 2>\\...\\<line n>\n
"""
import bs4
import re
import requests
BASE_URL = 'http://www.toti.eu.com/beatles/'
TAGS_TO_FILTER = ('style', 'script', '[document]', 'head', 'title')
SONG_MATCHER = r'showsong\.asp\?id=[0-9]+'
MATCHERS = (
    'Visit also:',
    'www',
    '#BeginLibraryItem',
    '#EndLibraryItem',
    'The Beatles Lyrics Repository',
    'Main page')
def include(element):
    """Returns True if the element is lyrics text rather than page chrome."""
    if element.parent.name in TAGS_TO_FILTER:
        return False
    element = element.lstrip()
    if not element:
        return False
    for matcher in MATCHERS:
        if element.startswith(matcher):
            return False
    return True
def process(element):
    """Strips whitespace and normalizes quotation characters."""
    # TODO(eugenhotaj): Handle very rare cases where Python does not strip the
    # carriage return correctly. Currently I'm just doing a manual delete in vim.
    element = element.strip()
    # Replace curly apostrophes and quotation marks with plain ASCII ones.
    element = element.replace('’', '\'')
    element = element.replace('`', '\'')
    element = element.replace('“', '"')
    element = element.replace('”', '"')
    return element
def scrape_song(url):
    """Scrapes the song page at `url` and returns (title, writer, lines)."""
    resp = requests.get(url)
    soup = bs4.BeautifulSoup(resp.text, features='lxml')
    elements = soup.find_all(string=True)
    elements = [element for element in elements if include(element)]
    elements = [process(element) for element in elements]
    title, writer = elements[:2]
    lines = elements[2:]
    return title, writer, lines
if __name__ == "__main__":
resp = requests.get(BASE_URL + 'showall.asp')
song_urls = re.findall(SONG_MATCHER, resp.text)
database = []
for i, song_url in enumerate(song_urls):
title, writer, lines = scrape_song(BASE_URL + song_url)
database.append((title, writer, lines))
        if (i + 1) % 10 == 0:
            print("Scraped %d songs; %d remaining." % (i + 1, len(song_urls) - i - 1))
    with open('dataset.txt', 'w') as file_:
        for title, writer, lines in database:
            line = "{}\t{}\t{}\n".format(title, writer, "\\".join(lines))
            file_.write(line)