#!script
# ./run_scrapers.py text bills votes stats
import os, os.path, glob, re, hashlib, shutil, sys, datetime
from django.conf import settings
from exclusiveprocess import Lock
# Ensure we don't run scrapers concurrently since they can
# mess each other up.
Lock(die=True).forever()
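# (Assumption about the exclusiveprocess library's behavior: Lock(die=True)
# exits the process immediately if another instance already holds the lock,
# and .forever() keeps the lock held until this process terminates.)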
CONGRESS = int(os.environ.get("CONGRESS", settings.CURRENT_CONGRESS))
# UTILS
def md5(fn, modulo=None):
    # do an MD5 on the file but run a regex first
    # to remove content we don't want to check for
    # differences.
    with open(fn, "rb") as fobj:
        data = fobj.read()
    if modulo is not None: data = re.sub(modulo, b"--", data)
    md5 = hashlib.md5()
    md5.update(data)
    return md5.digest()
def copy(fn1, fn2, modulo):
    # Don't copy truly unchanged files because we want to keep
    # file contents the same so long as no real data changed.
    # When we load into our db, we use hashes to check whether we
    # need to process a file. And for rsync users, don't make
    # them re-download files that have no real changes.
    if os.path.exists(fn2):
        if md5(fn1, modulo) == md5(fn2, modulo):
            return False
    #print(fn2)
    shutil.copy2(fn1, fn2)
    return True
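# Example (hypothetical file names and pattern): treat two mirrored XML files
# as unchanged when only a volatile timestamp attribute differs:
#   copy("scraped/h1.xml", "data/h1.xml", rb' updated="[^"]*"')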
# MAIN
# Set options.
log_level = "error"
if "DEBUG" in os.environ: log_level = "info"
# Run scrapers and parsers.
if "people" in sys.argv:
if CONGRESS != settings.CURRENT_CONGRESS: raise ValueErrror()
# Pull latest people YAML.
os.system("cd %s/congress-legislators; git fetch -pq" % settings.CONGRESS_PROJECT_PATH)
os.system("cd %s/congress-legislators; git merge --ff-only -q origin/main" % settings.CONGRESS_PROJECT_PATH)
# Load YAML (directly) into db.
os.system("./parse.py person") # -l ERROR
os.system("./manage.py update_index -v 0 -u person person")
#os.system("./manage.py prune_index -u person person")
# Save a fixture.
os.system("./manage.py dumpdata --format json person > data/db/django-fixture-people.json")
if "committees" in sys.argv:
if CONGRESS != settings.CURRENT_CONGRESS: raise ValueErrror()
# Committee metadata.
# Pull latest YAML.
os.system("cd %s/congress-legislators; git fetch -pq" % settings.CONGRESS_PROJECT_PATH)
os.system("cd %s/congress-legislators; git merge --ff-only -q origin/main" % settings.CONGRESS_PROJECT_PATH)
# Committee events.
os.system("cd %s; usc-run committee_meetings --docs=False --log=%s" % (settings.CONGRESS_PROJECT_PATH, log_level))
# Load into db.
os.system("./parse.py -l ERROR committee")
# Generate historical XML, used by prognosis & session stats.
os.system(". %s/congress-legislators/scripts/.env/bin/activate; python committee/archive_committee_membership.py %s/congress-legislators/ data/historical-committee-membership/%s.xml"
% (settings.CONGRESS_PROJECT_PATH, settings.CONGRESS_PROJECT_PATH, CONGRESS))
# Save a fixture.
os.system("./manage.py dumpdata --format json committee.Committee committee.CommitteeMember > data/db/django-fixture-committees.json")
do_bill_parse = False
if "text" in sys.argv:
# Do this before bills because the process of loading into the db checks for new
# bill text and generates feed events for text availability.
# Update the mirror of bill text from GPO's GovInfo.gov.
os.system("cd %s; usc-run govinfo --collections=BILLS --extract=mods,text,xml --years=%s --log=%s" % (settings.CONGRESS_PROJECT_PATH,
",".join(str(datetime.datetime.now().year + d) for d in (-1, 0)), log_level))
# Also metadata for committee reports.
os.system("cd %s; usc-run govinfo --collections=CRPT --extract=mods --years=%s --log=%s" % (settings.CONGRESS_PROJECT_PATH,
",".join(str(datetime.datetime.now().year + d) for d in (-1, 0)), log_level))
# Update text incorporation analysis for any new text versions.
os.system("analysis/text_incorporation.py analyze %d" % CONGRESS)
os.system("analysis/text_incorporation.py load %d" % CONGRESS)
# Clear old bill text PDF thumbnail images. They'll be regenerated on the fly if a user visits a page that needs it.
find_cmd = """find congress/ -mtime +90 \( -name "document-_text_image*.png" -or -name "document-card*.png" -or -name "document-thumbnail*.png" \) -print0"""
#os.system(find_cmd + """ | du --files0-from=- -hc | tail -n1""") # show total size on disk that would be reclaimed
#os.system(find_cmd + """ | xargs -0 rm""")
# don't know if we got any new files, so assume we now need to update bills
do_bill_parse = True
if "bills" in sys.argv:
# Scrape.
if CONGRESS >= 114:
os.system("cd %s; usc-run govinfo --bulkdata=BILLSTATUS --log=%s; usc-run bills --govtrack --congress=%d --log=%s" % (settings.CONGRESS_PROJECT_PATH, log_level, CONGRESS, log_level))
# Scrape upcoming House bills.
os.system("cd %s; usc-run upcoming_house_floor --download --log=%s" % (settings.CONGRESS_PROJECT_PATH, log_level))
do_bill_parse = True
# os.system("./manage.py dumpdata --format json bill.BillTerm > data/db/django-fixture-billterms.json")
if do_bill_parse:
    # Load into db.
    os.system("./parse.py --congress=%d -l %s bill" % (CONGRESS, log_level))
    os.system("./parse.py --congress=%d -l %s amendment" % (CONGRESS, log_level))
    os.system("./manage.py shell -c \"import bill.models; bill.models.Bill.LoadStatementsOfAdministrationPolicy()\" > /dev/null")

    # Bills are indexed as they are parsed, but to freshen the index...
    # Because bills index their full text, reindexing is substantial, so set
    # the TIMEOUT and BATCH_SIZE options in the haystack connections appropriately.
    # ./manage.py update_index -v 2 -u bill bill
    # Or for a single Congress:
    # import tqdm
    # from bill.models import Bill
    # from bill.search_indexes import BillIndex
    # bill_index = BillIndex()
    # for bill in tqdm.tqdm(Bill.objects.filter(congress=115)):
    #     bill.update_index(bill_index)
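    # A hypothetical example of those settings (the engine and URL are
    # placeholders; TIMEOUT and BATCH_SIZE are standard django-haystack
    # connection options):
    # HAYSTACK_CONNECTIONS = {
    #     "bill": {
    #         "ENGINE": "haystack.backends.solr_backend.SolrEngine",
    #         "URL": "http://localhost:8983/solr/bill",
    #         "TIMEOUT": 60 * 5,
    #         "BATCH_SIZE": 100,
    #     },
    # }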
if "votes" in sys.argv:
# Scrape.
if CONGRESS >= 101:
os.system("cd %s; usc-run votes --govtrack --log=%s --force --fast --congress=%d" % (settings.CONGRESS_PROJECT_PATH, log_level, CONGRESS))
# Load into db.
os.system("./parse.py vote --congress=%d -l %s" % (CONGRESS, log_level))
# Update key votes analysis.
os.system("analysis/key_votes.py %d" % CONGRESS)
# During election season.
#os.system("analysis/missed_votes_prezcandidates.py > /tmp/votes-$$.json && mv /tmp/votes-$$.json data/analysis/presidential-candidates-missed-votes.json")
if "stats" in sys.argv:
os.system("analysis/sponsorship_analysis.py %d" % CONGRESS)
os.system("analysis/missed_votes.py %d" % CONGRESS)
if "am_mem_bills" in sys.argv:
# American Memory
os.syste("for c in {6..42}; do echo $c; ./parse.py bill --force --congress=$c --level=warn; done")
if "stat_bills" in sys.argv:
# Pull in statutes from the 85th-92nd Congress
# via the GPO's Statutes at Large.
os.system("cd %s; usc-run govinfo --collections=STATUTE --extract=mods --log=%s" % (settings.CONGRESS_PROJECT_PATH, "warn")) # log_level
os.system("cd %s; usc-run statutes --volumes=65-86 --log=%s" % (settings.CONGRESS_PROJECT_PATH, "warn")) # log_level
os.system("cd %s; usc-run statutes --volumes=87-106 --textversions --log=%s" % (settings.CONGRESS_PROJECT_PATH, "warn")) # log_level
# Copy bill metadata into our legacy location.
# (No need to copy text-versions anywhere: we read it from the congress data directory.)
for congress in range(82, 92+1):
print(congress, "...")
# Load into db.
os.system("./parse.py --congress=%d bill" % congress) # -l ERROR
if "photos" in sys.argv:
# Pull in any new photos from the unitedstates/images repository.
import person.models, os, shutil, yaml
#os.system("cd ../scripts/congress-images; git pull --rebase")
src = '../scripts/congress-images/congress/original/'
dst = 'static/legislator-photos/'
# Get a list of GovTrack IDs and Bioguide IDs for which photos are provided
# in the unitedstates/images repo. Only import photos of current Members of
# Congress because I haven't reviewed older photos necessarily.
bioguide_ids = [f[len(src):-4] for f in glob.glob(src + '*.jpg')]
id_pairs = person.models.Person.objects.filter(
bioguideid__in=bioguide_ids,
roles__current=True)\
.values_list('id', 'bioguideid')
for govtrack_id, bioguide_id in id_pairs:
# source JPEG & sanity check that it exists
fn1 = src + bioguide_id + ".jpg"
if not os.path.exists(fn1):
print("Missing: " + fn1)
continue
# destination file name
fn2 = dst + str(govtrack_id) + ".jpeg"
# need to review?
if not (os.path.exists(fn2) and md5(fn1) == md5(fn2)):
p = person.models.Person.objects.get(id=govtrack_id)
r = p.roles.get(current=True)
print(("change" if os.path.exists(fn2) else "new"), p)
print("<hr><p>%s</p>" % p.name.encode("utf8"))
print("<table cols=2><tr>")
if os.path.exists(fn2):
print("<td><img src='https://www.govtrack.us/static/legislator-photos/%d.jpeg'></td>" % p.id)
else:
print("<iframe src='%s' width=100%% height=500> </iframe>" % ("https://twitter.com/"+p.twitterid if p.twitterid else r.website))
print("<td><img src='https://raw.githubusercontent.com/unitedstates/images/newscraper/congress/original/%s.jpg'></td>" % bioguide_id)
print("</tr></table>")
metadata = yaml.load(open(fn1.replace("/original/", "/metadata/").replace(".jpg", ".yaml")))
print("<p>%s</p><p>%s</p>" % (metadata['link'], metadata['name']))
continue
# check if the destination JPEG already exists and it has different content
if os.path.exists(fn2) and md5(fn1) != md5(fn2):
# Back up the existing files first. If we already have a backed up
# image, don't overwrite the back up. Figure out what to do another
# time and just bail now. Check that we won't overwrite any files
# before we attempt to move them.
def get_archive_fn(fn):
return fn.replace("/photos/", "/photos/archive/")
files_to_archive = [fn2] + glob.glob(fn2.replace(".jpeg", "-*"))
for fn in files_to_archive:
if os.path.exists(get_archive_fn(fn)):
raise ValueError("Archived photo already exists: " + fn)
# Okay now actually do the backup.
for fn in files_to_archive:
print(fn, "=>", get_archive_fn(fn))
shutil.move(fn, get_archive_fn(fn))
# Copy in the file if it's new.
if copy(fn1, fn2, None):
print(fn1, "=>", fn2)
# get required metadata
metadata = yaml.load(open(fn1.replace("/original/", "/metadata/").replace(".jpg", ".yaml")))
if metadata.get("name", "").strip() == "": raise ValueError("Metadata is missing name.")
if metadata.get("link", "").strip() == "": raise ValueError("Metadata is missing link.")
# Write the metadata.
with open(fn2.replace(".jpeg", "-credit.txt"), "w") as credit_file:
credit_file.write( (metadata.get("link", "").strip() + " " + metadata.get("name", "").strip() + "\n").encode("utf-8") )
# Generate resized versions.
for size_width in (50, 100, 200):
size_height = int(round(size_width * 1.2))
os.system("convert %s -resize %dx%d^ -gravity center -extent %dx%d %s"
% (fn2, size_width, size_height, size_width, size_height,
fn2.replace(".jpeg", ("-%dpx.jpeg" % size_width)) ))