forked from yamins81/govdata-parser
-
Notifications
You must be signed in to change notification settings - Fork 0
/
fed_parse.py
33 lines (26 loc) · 1.31 KB
/
fed_parse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
import re
from BeautifulSoup import BeautifulSoup
from starflow import utils
def hasDataDownloadProgram(li):
    """Check whether a list item is available in the Fed site's data download program.

    li -- a parsed element exposing BeautifulSoup's ``findAll`` (here an
          ``<li>`` tag taken from the statistics table).

    Returns True when ``li`` contains at least one ``<a>`` tag whose ``href``
    matches ``.aspx``, False otherwise.
    """
    # Raw string: '\.' in a plain literal is an invalid escape sequence that
    # newer Python versions warn about; the compiled pattern is unchanged.
    aspxHref = re.compile(r'.*\.aspx.*')
    # An empty result list is falsy, so bool() replaces the "!= []" comparison.
    return bool(li.findAll('a', {'href': aspxHref}))
def extractRel(urlString):
    """Extract the ``rel`` value from a data-download link.

    urlString -- the link target, e.g. ``'.../Choose.aspx?rel=H15'``.

    Returns everything that follows ``.aspx?rel=`` up to the end of the line
    (any further query parameters are included, as before), or None when the
    URL does not contain ``.aspx?rel=``.
    """
    found = re.match(r'.*\.aspx\?rel=(.*)', urlString)
    if found is None:
        # Previously this case crashed with AttributeError on None.groups().
        return None
    return found.group(1)
def getDataCodes(string):
    """Return every data code found in ``string``, in order of appearance.

    A data code is one capital letter followed by one or more ``.<digits>``
    parts, e.g. ``H.15`` or ``H.4.1``.  Duplicates are kept; an input with no
    codes yields an empty list.
    """
    # Non-capturing group so findall returns the whole match as a string.
    # The previous pattern wrapped the code in a repeated capturing group,
    # which merged back-to-back codes ("G.17H.15") into a single match and
    # reported only the last one.
    return re.findall(r'[A-Z](?:\.[0-9]+)+', string)
def getContentDataCodes(li):
    """Collect the distinct data codes referred to in a list item's content.

    Apply this to one element of the list: the text gathered by
    getContentStringFromLi(li) is scanned with getDataCodes and the codes
    are returned as a set.
    """
    contentString = getContentStringFromLi(li)
    codes = getDataCodes(contentString)
    return set(codes)
def getContentStringFromLi(li):
    """Concatenate the contents of a list item with those of its links.

    Looks at both the contents of ``li`` itself and the contents of every
    ``<a>`` tag returned by ``li.findAll('a')``: ``utils.Contents`` is applied
    to each, and the results are joined in that order with no separator.
    """
    # NOTE(review): assumes utils.Contents returns a string -- the result is
    # passed to a regex search by getContentDataCodes; confirm against starflow.
    pieces = [utils.Contents(li)]
    pieces.extend(utils.Contents(anchor) for anchor in li.findAll('a'))
    # str.join replaces reduce(lambda x, y: x + y, ...): reduce is no longer
    # a builtin on Python 3, and join avoids repeated pairwise concatenation.
    return ''.join(pieces)
## Parse the saved statistics page: locate the table with class "stats", take
## its <h2> headers, and for each header collect the <li> items found under
## the element that follows it.
# The handle is opened in a with-block so it is closed once the table has
# been located (it used to be left open, under a name shadowing the builtin
# "file").
# NOTE(review): relies on BeautifulSoup reading the whole file while the
# handle is still open -- confirm for the installed version.
with open('statisticsdata.htm') as statsFile:
    linkTable = BeautifulSoup(statsFile, convertEntities='html').find('table', {'class': 'stats'})

headers = linkTable.findAll('h2')
links = [header.findNext().findAll('li') for header in headers]

## res holds one list per header; each entry is a tuple of
## (header contents, item content string, set of data codes, has-download flag).
res = [
    [
        (
            utils.Contents(header),
            getContentStringFromLi(item),
            getContentDataCodes(item),
            hasDataDownloadProgram(item),
        )
        for item in items
    ]
    for (items, header) in zip(links, headers)
]