awless_to_neo.py (forked from rdkls/aws_infra_map_neo4j)
#!/usr/bin/env python
# Load your AWS environment into neo4j
# - use awless to query aws, store resource descriptions in badwolf rdf stores
# - clean these files to be rdf-compliant
# - load into neo4j with semantic importer
# - update graph with node labels and properties for niceness
#
# awless uses standard aws env vars to do its thing; just set these as desired before running
#
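# A typical environment, with illustrative values (not from this repo), might look like:
#   export AWS_ACCESS_KEY_ID=AKIA...            # or rely on a named profile / instance role
#   export AWS_SECRET_ACCESS_KEY=...
#   export AWS_DEFAULT_REGION=ap-southeast-2
#   export AWS_TO_NEO4J_LIMIT_REGION=ap-southeast-2   # optional; limits the sync to one region
#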
# Examples:
#
# Do it all
# ./awless_to_neo.py
#
# Do one region, using existing badwolf dbs
# ./awless_to_neo.py --region=ap-southeast-2 --skip-sync
#
# References:
#
# awless
# https://github.com/wallix/awless
# RDF loader by jbarrasa
# https://jbarrasa.com/2016/06/07/importing-rdf-data-into-neo4j/
# https://github.com/jbarrasa/neosemantics
from neo4j.exceptions import ServiceUnavailable
from neo4j.v1 import GraphDatabase
from pprint import pprint
import argparse
import boto3
import json
import os
import re
import subprocess
import neobolt
DEBUG = True
AWLESS_DATABASE_PATH = '/root/.awless/aws/rdf/default/%s/'
CORRECTED_SUFFIX = '.corrected.nt'
THE_NEO4J_BASEDIR = '/var/lib/neo4j'
THE_NEO4J_BASEDIR_DEBUG = '/Users/nick.doyle/ws/3rdparty/neo4j/aws_infra_map'
AWLESS_DATABASE_PATH_DEBUG = '/Users/nick.doyle/.awless/aws/rdf/default/%s/'
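# NEO4J_AUTH is assumed to be in the "user/password" form used by the official
# neo4j docker image, e.g. NEO4J_AUTH=neo4j/secret (illustrative value)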
def get_neo4j_auth():
(user, password) = os.environ.get('NEO4J_AUTH').split('/')
return (user, password)
def correct_file(infile, region, debug):
# Output file will be prefix of the input file + '.corrected.nt'
# In the neo4j import dir
# (neo4j security requirement in order to be able to import)
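    # Illustrative example of the rewrite this performs (made-up triple, not from a real sync):
    #   in : <i-0abc123> <cloud:id> "i-0abc123"^^<xsd:string> .
    #   out: <resource:i-0abc123> <cloud:cloud_id> "i-0abc123" .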
(prefix, suffix) = os.path.splitext(os.path.split(infile)[1])
outfile = region + '-' + prefix + CORRECTED_SUFFIX
if debug:
neo4j_basedir = THE_NEO4J_BASEDIR_DEBUG
else:
neo4j_basedir = THE_NEO4J_BASEDIR
outfile = os.path.join(neo4j_basedir, 'import', outfile)
with open(outfile, 'w') as of:
with open(infile, 'r') as f:
for line in f:
line = line.strip()
                m = re.match(r'([^\s]+) ([^\s]+) (.*) \.', line)
                if not m:
                    continue  # skip blank or malformed lines rather than crashing
                (sub, pred, obj) = m.groups()
# SUB
# Prepend resource: if needed
if re.match('<[^:]*>', sub):
sub = re.sub('<(.*)>', '<resource:\\1>', sub)
# OBJ
# Prepend resource: if needed
if re.match('<[^:]*>', obj):
obj = re.sub('<(.*)>', '<resource:\\1>', obj)
                # S3 grantees come through as blank nodes (_:x), which would be discarded and hence not linked to the grant, so rewrite them as resources
obj = re.sub('_:(.*)', '<resource:\\1>', obj)
                # Trim ^^ and anything following (happens when a type is specified - we don't care, they're all strings)
                obj = re.sub(r"(.+)\^\^.*", r'\1', obj)
                # Replace <cloud-owl:*> with the quoted string "*"
                obj = re.sub('<cloud-owl:([^>]+)>', r'"\1"', obj)
                # Replace <net-owl:*> with the quoted string "*"
                obj = re.sub('<net-owl:([^>]+)>', r'"\1"', obj)
                # Move id to cloud_id; where these are ARNs or UUIDs
                # they won't get imported,
                # so we leave neo4j to set the node ID
                # and move these to their own prop "cloud_id" for later use/cleaning.
                # Replace <cloud:id> with <cloud:cloud_id>
pred = re.sub("<cloud:id>", "<cloud:cloud_id>", pred)
of.write('%s %s %s .\n' % (sub, pred, obj))
return outfile
def init_graph():
d = GraphDatabase.driver('bolt://127.0.0.1:7687', auth=get_neo4j_auth(), encrypted=False)
with d.session() as session:
        # Create required uniqueness constraint
cypher = 'CREATE CONSTRAINT n10s_unique_uri ON (r:Resource) ASSERT r.uri IS UNIQUE'
try:
session.run(cypher)
except neobolt.exceptions.ClientError:
            # Constraint already exists
pass
# Init graph config for semantics plugin
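        # (creates the graph config node that the n10s RDF import procedures require)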
cypher = 'CALL n10s.graphconfig.init()'
try:
session.run(cypher)
except neobolt.exceptions.ClientError:
            # Graph config already initialised
pass
def load_to_neo4j(filenames):
    # Load the corrected rdf files from neo4j/import into the db
d = GraphDatabase.driver('bolt://127.0.0.1:7687', auth=get_neo4j_auth(), encrypted=False)
with d.session() as session:
for fn in filenames:
# use jbarrasa's plugin to load the now-correct rdf into neo4j
# see https://github.com/jbarrasa/neosemantics
            print('load file %s' % fn)
cypher = "call n10s.rdf.import.fetch('file:///%s', 'N-Triples', {shortenUrls: false})" % fn
with session.begin_transaction() as tx:
                res = tx.run(cypher)
                res.consume()
print('... loaded')
def fix_db():
    print('Fixing the Graph DB Labels, Props and Relationships with a bunch of dodgy inferences .....')
    # Fix up the db for niceness - add node labels, set names
d = GraphDatabase.driver('bolt://127.0.0.1:7687', auth=get_neo4j_auth(), encrypted=False)
pprint(d)
with d.session() as session:
# Remove generic 'Resource:' label
cypher = """
match (n)
where n:Resource
CALL apoc.create.removeLabels(n, ['Resource'])
YIELD node
RETURN node
"""
session.run(cypher)
# Strip redundant resource: on names & uris
# (we hacked this on before so RDF import didn't fail)
cypher = """
match (n)
CALL apoc.create.setProperty(n, 'name', replace(n.name, 'resource:', ''))
YIELD node
RETURN node
"""
session.run(cypher)
cypher = """
match (n)
CALL apoc.create.setProperty(n, 'uri', replace(n.uri, 'resource:', ''))
YIELD node
RETURN node
"""
session.run(cypher)
# Merge on ns0 & ns1 cloud_id
# (don't do all in one query lest you break neo4j)
cypher = """
MATCH (n),
(p)
WHERE exists(n.ns0__cloud_id)
and
n.ns0__cloud_id = p.ns0__cloud_id
and
id(n) < id(p)
WITH [n,p] as nodes
CALL apoc.refactor.mergeNodes(nodes)
YIELD node
RETURN node
"""
session.run(cypher)
cypher = """
MATCH (n),
(p)
WHERE exists(n.ns1__cloud_id)
and
n.ns1__cloud_id = p.ns1__cloud_id
and
id(n) < id(p)
WITH [n,p] as nodes
CALL apoc.refactor.mergeNodes(nodes)
YIELD node
RETURN node
"""
session.run(cypher)
# AMIs and AMI Locations
# Need to do this before setting Label based on ns0__type
# Since AMIs have this field set to "machine"
session.run("match (n) where n.uri =~ '^ami-.*' set n:Ami")
session.run("match (n)<-[:ns0__location]-(:Ami) where not labels(n) set n:AmiLocation")
# Set label based on ns0__type preferably
# then ns1 (the more-specific)
cypher = """
match (n)
where not labels(n)
CALL apoc.create.addLabels(n, [n.ns0__type])
YIELD node
RETURN node
"""
session.run(cypher)
cypher = """
match (n)
where not labels(n)
CALL apoc.create.addLabels(n, [n.ns1__type])
YIELD node
RETURN node
"""
session.run(cypher)
# Set Name on ns0__name preferably
# then ns1 (the more-specific)
cypher = """
match (n)
where not labels(n)
CALL apoc.create.setProperty(n, 'name', n.ns0__name)
YIELD node
RETURN node
"""
session.run(cypher)
cypher = """
match (n)
where not labels(n)
CALL apoc.create.setProperty(n, 'name', n.ns1__name)
YIELD node
RETURN node
"""
session.run(cypher)
session.run("match (n) where n.name =~ '^arn:aws:iam::.*:role.*' set n:Role")
session.run("match (n) where n.name =~ '^arn:aws:sns:.*' set n:SNSTopic")
session.run("match ()-[:`cloud:grantee`]->(n) set n:Grantee")
session.run("match (n {`cloud:granteeType`: 'CanonicalUser'}) set n:Grantee")
session.run("match ()-[:`cloud:securityGroups`]->(n) set n:Securitygroup")
session.run("match (n) where n.uri =~ '^subnet.*' set n:Subnet")
session.run("match (n) where n.uri =~ '.*FirewallRule.*' set n:FirewallRule")
session.run("match (n) where n.uri =~ '.*Route.*' set n:Route")
session.run("match (n) where n.uri =~ '^vpc-.*' set n:Vpc")
session.run("match (n) where n.uri =~ '^vol-.*' set n:Volume")
session.run("match (n)<-[:ns0__role]-() where not labels(n) set n:Role")
session.run("match (n)<-[:ns0__location]-(:Image) where not labels(n) set n:ImageLocation")
session.run("match (n)<-[:ns1__zone]-() where not labels(n) set n:Route53HostedZone")
session.run("match (n)<-[:ns1__associations]-() set n:RoutetableAssociation")
# Relate route table associations
session.run("""
match (a:RoutetableAssociation),
(s:Subnet)
where a.ns1__value = s.ns1__id
merge (a)-[:ns1__associationTo]->(s)
""")
# ECS
session.run("match (n)<-[:ns1__containersImages]-() where (not labels(n) or n:KeyValue) set n:Containerimage")
session.run("match (n:Containertask) set n.arn = n.name")
session.run("match (n:Container) where exists(n.ns0__name) set n.name = n.ns0__name")
session.run("match (n:Containertask) where exists(n.arn) set n.name = apoc.text.replace(n.arn, '^.*?/', '')")
session.run("match (n:Containertask) where exists(n.ns1__arn) set n.name = apoc.text.replace(n.ns1__arn, '^.*?/', '')")
session.run("match (n:Containertask) where exists(n.ns0__arn) set n.name = apoc.text.replace(n.ns0__arn, '^.*?/', '')")
session.run("match (n:Containercluster) where exists(n.name) set n.arn = n.name")
session.run("match (n:Containercluster) where exists(n.ns0__arn) set n.arn = n.ns0__arn")
session.run("match (n:Containercluster) set n.name = apoc.text.replace(n.arn, '^.*?/', '')")
# SNS & SQS
session.run("match (n:Topic) set n:SnsTopic")
session.run("match (n)<-[:ns0__topic]-() set n:SnsTopic")
session.run("match (n) where n:Topic CALL apoc.create.removeLabels(n, ['Topic']) YIELD node return node")
session.run("match (n:SnsTopic) set n.name = apoc.text.replace(n.uri, '^.*:', '')")
session.run("match (n:SnsTopic) set n.name = apoc.text.replace(n.name, '^.*:', '')")
session.run("match (n:Subscription) set n.name = apoc.text.replace(n.name, '^.*:', '')")
# Link up SNS Subscriptions to SQS Queues
session.run("match (s:Subscription), (q) where s.uri = q.ns0__arn merge (s)-[:subscription]->(q)")
session.run("match (n:Queue) set n.name = apoc.text.replace(n.name, '^.*:', '')")
# Lambdas (:Function)
session.run("match (n:Function) set n:Lambda")
session.run("match (n) where n:Function CALL apoc.create.removeLabels(n, ['Function']) YIELD node return node")
session.run("match (n:Lambda) set n.name=n.ns0__name")
# LBs & Target Groups
session.run("match (n)<-[:ns3__applyOn]-(:Targetgroup) where not labels(n) set n:TargetgroupTarget")
session.run("""
match (n:TargetgroupTarget),
(p:Networkinterface)
where n.uri = p.ns2__privateIP
merge (n)-[:trafficTo]->(p)
""")
session.run("""
match (n:Listener),
(p:Targetgroup)
where n.ns1__targetGroups = p.ns1__arn
merge (n)-[:forwardsTo]->(p)
""")
session.run("match (n:Targetgroup) set n.name=n.ns1__name")
session.run("match (n:Loadbalancer) set n.name=n.ns1__name")
session.run("match (n:Listener) set n.name=n.ns2__protocol + ':' + n.ns2__port")
# Names on SGs
session.run("match (n:FirewallRule) set n.name=n.ns2__cidr")
session.run("match (n:FirewallRule) where not exists(n.name) set n.name=n.ns1__source")
session.run("""
match (n)
where n.uri =~ '^arn:aws:iam:.*:role/.*'
set n:Role,
n.name = apoc.text.replace(n.uri, '^.*?/', '')
""")
# Label public and Private Subnets
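        # (a 0.0.0.0/0 route to an igw-* target marks a subnet public; to a nat-* target, private)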
session.run("""
match (s:Subnet)<-[:ns3__applyOn]-(rt:Routetable)-[:ns2__routes]->(r:Route)
where r.ns2__cidr='0.0.0.0/0'
and
r.ns2__routeTargets =~ '.*igw-.*'
set s:SubnetPublic
""")
session.run("""
match (s:Subnet)<-[:ns3__applyOn]-(rt:Routetable)-[:ns2__routes]->(r:Route)
where r.ns2__cidr='0.0.0.0/0'
and
r.ns2__routeTargets =~ '.*nat-.*'
set s:SubnetPrivate
""")
session.run("match (n:Queue) set n.name = apoc.text.replace(n.ns0__arn, '^.*?/', '')")
# Not super sure on "Grantee"
session.run("match (n) where n.ns0__granteeType = 'CanonicalUser' and not labels(n) set n:Grantee")
session.run("match (n)<-[:ns0__grantee]-() where not labels(n) set n:Grantee")
session.run("match (n)<-[:ns1__grantee]-() where not labels(n) set n:Grantee")
session.run("match (n:Grantee) set n.name = n.ns0__name")
session.run("match (n:Grantee) where n.name is null and n.ns1__name is not null set n.name = n.ns1__name")
session.run("match (n:Grant) where n.ns0__permission is not null set n.name = n.ns0__permission")
session.run("match (n:Grant) where n.ns1__permission is not null set n.name = n.ns1__permission")
# Set node labels - based on node props
res = session.run('match (n) where n.`rdf:type` is not null return n')
set_of_labels_to_apply = set()
for record in res:
set_of_labels_to_apply.add(record['n']['rdf:type'])
for label in set_of_labels_to_apply:
            cypher = 'match (n) where n.`rdf:type`="%s" set n:`%s`' % (label, label)
session.run(cypher)
# Set node labels - based on relationship props
cypher = 'match ()-[r:`rdf:type`]->(t) return t'
res = session.run(cypher)
set_of_labels_to_apply = set()
for record in res:
set_of_labels_to_apply.add(record['t']['uri'])
for rdf_type in set_of_labels_to_apply:
# Do the updates
# node label just remove anything before colon
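            # e.g. a type uri like "cloud-owl:Instance" would yield the label "Instance" (illustrative value)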
label = re.sub('.*:([^:]*)', '\\1', rdf_type)
            cypher = 'match (n)-[:`rdf:type`]->(t {uri: "%s"}) set n:`%s`' % (rdf_type, label)
session.run(cypher)
# Set node names
# from various properties, in descending order of preference
propnames = [
'cloud:name',
'cloud:keyName',
'cloud:id',
'cloud:cloud_id',
'cloud:permission',
'uri',
]
for propname in propnames:
session.run('match (n) where n.`%s` is not null and n.name is null set n.name = n.`%s`' % (propname, propname))
print(' AAAAAAAAAARE YOU READY TO RUUUUUUUUUUMBLE ')
# By this point we hope the user is ready to rumble
def get_all_regions():
ec2 = boto3.client('ec2')
res = ec2.describe_regions()
    regions = [r['RegionName'] for r in res['Regions']]
return regions
if '__main__' == __name__:
parser = argparse.ArgumentParser(description='Grab representation of your AWS environment into neo4j')
parser.add_argument('--region', help='limit to one aws region')
parser.add_argument('--infile', '-i', help='one awless n-triple db file to run with')
parser.add_argument('--skip-sync', '-s', action='store_true', help='skip awless sync; operate on local db files only')
parser.add_argument('--only-fix-db', action='store_true', help='only fix db')
parser.add_argument('--verbose', '-v', action='store_true')
parser.add_argument('--debug', action='store_true')
args = parser.parse_args()
init_graph()
if args.only_fix_db:
print('i only fix the db!')
fix_db()
import sys;sys.exit(0) # don't tell arjen
if not args.infile:
if args.region:
regions = [args.region]
elif os.environ.get('AWS_TO_NEO4J_LIMIT_REGION'):
regions = [os.environ.get('AWS_TO_NEO4J_LIMIT_REGION')]
else:
regions = get_all_regions()
for region in regions:
if not args.skip_sync:
                print('awless sync for region %s' % region)
                print(subprocess.check_output(['/usr/local/bin/awless', 'sync', '--aws-region=%s' % region]))
# Correct all files
corrected_filepaths = []
if args.debug:
awless_database_path = AWLESS_DATABASE_PATH_DEBUG
else:
awless_database_path = AWLESS_DATABASE_PATH
try:
fns = os.listdir(awless_database_path % region)
except OSError:
                print("can't load files from %s, skipping ..." % (awless_database_path % region))
continue
for fn in fns:
if not fn.endswith(CORRECTED_SUFFIX):
fullpath = os.path.join(awless_database_path % region, fn)
corrected_filepaths.append(correct_file(fullpath, region, args.debug))
if args.verbose:
                print('corrected files:')
                pprint(corrected_filepaths)
load_to_neo4j(corrected_filepaths)
fix_db()