-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcrawler.py
50 lines (46 loc) · 1.39 KB
/
crawler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
import requests
import time
from BeautifulSoup import BeautifulSoup
import re
from time import sleep
# Timestamp of script start. NOTE(review): never read afterwards — per-team
# timing below uses its own fetch_team timestamp; candidate for removal.
prog_start=time.time()
# Teams whose squad pages will be crawled; names are slugified by makeMyURL.
team_list=["Manchester United","Chelsea","Liverpool","Arsenal","Tottenham Hotspur","Southampton","Manchester City","Real Madrid","Barcelona"]
def crawl(url):
    """Fetch `url` and return the raw response body, retrying until success.

    Retries forever with a 2-second pause between attempts. Only network-level
    failures (requests.RequestException) trigger a retry; programming errors
    propagate instead of being silently swallowed.
    """
    while True:
        try:
            # timeout stops a stalled connection from hanging the retry loop
            # forever (the original had no timeout, so a hung socket blocked
            # indefinitely); 30s is generous for a single page fetch.
            r = requests.get(url, timeout=30)
            return r.content
        except requests.RequestException as e:
            # single-argument print() form works under both Python 2 and 3
            print(e)
            sleep(2)
            print("Retrying!!")
# Root of every team page on squawka; makeMyURL appends the slug and /squad.
base_url = "http://www.squawka.com/teams/"
def makeMyURL(teamname):
    """Return the squawka squad-page URL for a human-readable team name.

    e.g. "Manchester United" -> http://www.squawka.com/teams/manchester-united/squad
    """
    slug = teamname.lower().replace(" ", "-")
    return "{0}{1}/squad".format(base_url, slug)
# Main crawl loop: fetch each team's squad page, then print every player's
# display name (from the photo <td>) and profile slug (from the stats link).
for team in team_list:
    fetch_team = time.time()  # per-team timing start
    url = makeMyURL(team)
    print("Starting to crawl " + team + " from " + url)
    html = crawl(url)
    print("Crawled page!")
    soup = BeautifulSoup(html)
    player_names = soup.findAll("td", {"class": "squadplayerphoto"})
    print(player_names)
    player_links = soup.findAll("a", {"class": "teamstatinfo"})
    print("-----------------------------------------------")
    print(" PLAYERS FROM " + team)
    print("-----------------------------------------------")
    # zip pairs each name cell with its link anchor and stops at the shorter
    # list — the original manual counter could raise IndexError when the page
    # yielded fewer name cells than links.
    for name_td, link_a in zip(player_names, player_links):
        # guard both searches: a non-matching row no longer crashes with
        # AttributeError on .group(1); patterns are identical to the originals
        name_match = re.search('<div>(.*)</div>', str(name_td))
        if name_match:
            print(name_match.group(1))
        link_match = re.search('players/(.*)/stats/../stats\">', str(link_a))
        if link_match:
            print(link_match.group(1))
    print("")
    print("Time taken is" + str(time.time() - fetch_team) + " s")