import os
import argparse
import pickle
import editdistance
import datetime
import itertools
import resolver
import common.logger
import common.connect
import common.profilestore
import gplus.core
import facebook.core
import twitter.core
import linkedin.core
import gplus.connect
import facebook.connect
import twitter.connect
import linkedin.connect
import gplus.search
import facebook.search
import twitter.search
import linkedin.search
import gplus.downloader
import facebook.downloader
import twitter.downloader
import linkedin.downloader
import gplus.analyser
import facebook.analyser
import twitter.analyser
import linkedin.analyser
import web.orgProfileMiner
#Handle arguments:
#- /run_name/ is used to name the run and its output files,
#- /url/ is the main web URL of the targeted organisation.
#See the README for information on key files.
parser = argparse.ArgumentParser(description='Score the social media exposure of an organisation and its employees.')
parser.add_argument('run_name', help='The name to use for the run and its output files.')
parser.add_argument('url', help='The url of the company website')
parser.add_argument('--gk', help='The keyfile containing one or more Google+ access keys.')
parser.add_argument('--fk', help='The keyfile containing one or more Facebook access keys.')
parser.add_argument('--tk', help='The keyfile containing one or more Twitter access key sets.')
parser.add_argument('--lk', help='The keyfile containing one or more LinkedIn access key sets.')
args = parser.parse_args()
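#Example invocation (hypothetical run name, URL and keyfile paths):
#  python vuln_scorer.py acme-run http://www.example-corp.com --tk twitter.keys --fk facebook.keys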
#Initialise logger
logger = common.logger.getLogger('sampling-tool', output=args.run_name+'.log', level='info')
logger.info('Logger initialised.')
#Initialise centralised store
#This is where affiliate profiles will be stored.
resdirname = 'results'
prefix = resdirname+os.sep+args.run_name+os.sep
if not os.path.isdir(prefix):
    os.makedirs(prefix)
db_file = prefix+'db.csv'
logger.info('Database is {}'.format(db_file))
profilestore = common.profilestore.ProfileStore(db_file, logger)
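#From the usage below, each ProfileStore record is a dict with the fields
#'url', 'search_term', 'network' and 'network_id'; the store assigns each
#record a 'uid', which is later used to name its pickled profile on disk.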
rawdir = prefix+'raw'
profdir = prefix+'profiles'
#Mine website
webdir = prefix+'web'
logger.info('Web store is {}'.format(webdir))
webminer = web.orgProfileMiner.WebOrg.fromURL(args.url)
webminer.getCache(webdir)
logger.info('Website cached.')
#Get social media links referring to the org.
alternates = webminer.getAlternates()
logger.info('Social profiles are {}. Filling seed database.'.format(alternates))
#Prepare seed storage
#This is where the organisation's own profiles will be gathered.
seedpsfile = prefix+'seed-db.csv'
seedrawdir = prefix+'seed-raw'
seedprofdir = prefix+'seed-profiles'
seedps = common.profilestore.ProfileStore(seedpsfile, logger=logger)
#Prime connection handlers.
gpconn = None
fbconn = None
twconn = None
liconn = None
downloaders = []
analysers = []
if args.gk:
    gpconn = common.connect.PooledConnection(args.gk, gplus.connect.GoogleConnection, logger)
    gpdown = gplus.downloader.GplusDownloader(seedps, gpconn, logger)
    gpanal = gplus.analyser.GplusAnalyser(seedps, logger=logger)
    downloaders.append(gpdown)
    analysers.append(gpanal)
if args.fk:
    fbconn = common.connect.PooledConnection(args.fk, facebook.connect.FacebookConnection, logger)
    fbdown = facebook.downloader.FacebookDownloader(seedps, fbconn, logger=logger)
    fbanal = facebook.analyser.FacebookAnalyser(seedps, logger=logger)
    downloaders.append(fbdown)
    analysers.append(fbanal)
if args.tk:
    twconn = common.connect.PooledConnection(args.tk, twitter.connect.TwitterConnection, logger)
    twdown = twitter.downloader.TwitterDownloader(seedps, twconn, logger=logger, include_connections=True)
    twanal = twitter.analyser.TwitterAnalyser(seedps, logger=logger)
    downloaders.append(twdown)
    analysers.append(twanal)
if args.lk:
    liconn = common.connect.PooledConnection(args.lk, linkedin.connect.LinkedInConnection, logger)
    lidown = linkedin.downloader.LinkedInDownloader(seedps, liconn, logger=logger)
    lianal = linkedin.analyser.LinkedInAnalyser(seedps, logger=logger)
    downloaders.append(lidown)
    analysers.append(lianal)
#Create seed records
for url in alternates:
    record = {}
    record['url'] = url
    record['search_term'] = None
    if gpconn and gplus.core.is_valid_result(gpdown, url):
        record['network'] = 'Google+'
        record['network_id'] = gplus.core.get_net_id(gpdown, url)
        seedps.add_record(record)
    if twconn and twitter.core.is_valid_result(twdown, url):
        record['network'] = 'Twitter'
        record['network_id'] = twitter.core.get_net_id(twdown, url)
        seedps.add_record(record)
    if fbconn and facebook.core.is_valid_result(fbdown, url):
        record['network'] = 'Facebook'
        record['network_id'] = facebook.core.get_net_id(fbdown, url)
        seedps.add_record(record)
    if liconn and linkedin.core.is_valid_result(lidown, url):
        record['network'] = 'LinkedIn'
        record['network_id'] = linkedin.core.get_net_id(lidown, url)
        seedps.add_record(record)
#Download seed profiles
logger.info('Beginning seed download phase')
for d in downloaders:
    d.run(dirpath=seedrawdir)
logger.info('Beginning seed analysis phase')
for a in analysers:
    a.run(indirpath=seedrawdir, outdirpath=seedprofdir)
neturls = {"Twitter": "http://www.twitter.com/{}",
           "Facebook": "http://www.facebook.com/{}",
           "LinkedIn": "http://www.linkedin.com/{}",
           "Google+": "http://plus.google.com/{}"}
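#e.g. neturls['Twitter'].format('12345678') yields 'http://www.twitter.com/12345678'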
#Transfer affiliated profiles to central store.
for f in os.listdir(seedprofdir):
    with open(seedprofdir+os.sep+f, 'rb') as fh:
        p = pickle.load(fh)
    idlist = [u.uid for u in p.interacted + p.followers + p.followed_by + p.grouped]
    idlist = list(set(idlist))
    for uid in idlist:
        record = {}
        record['network'] = p.network
        record['network_id'] = uid
        record['url'] = neturls[p.network].format(str(uid))
        record['search_term'] = p.uid
        profilestore.add_record(record)
#Download affiliate profiles
#(change of profilestore from prior, but re-use connections).
logger.info('Beginning affiliate download phase')
for d in downloaders:
    d.profilestore = profilestore
    d.run(dirpath=rawdir)
logger.info('Beginning affiliate analysis phase')
for a in analysers:
    a.profilestore = profilestore
    a.run(indirpath=rawdir, outdirpath=profdir)
logger.info('End of affiliate information gathering')
def string_sim(n1, n2):
    """ Returns a similarity score in [0,1] based on the Levenshtein
    distance between two strings, normalised by the longer length. """
    if (not n1) or (not n2):
        return 0
    l1 = len(n1)
    l2 = len(n2)
    diff = editdistance.eval(n1, n2)
    return 1 - (diff / (l1 if l1 > l2 else l2))
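#Worked example: string_sim('smith', 'smyth') has edit distance 1 over a
#longest length of 5, so the similarity is 1 - 1/5 = 0.8 (just below the
#0.8 cut-off used in is_employee below).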
def is_employee(profile, employer_profiles, employee_names, employer_names):
    """ Uses a decision-tree derived algorithm to decide if a profile is an
    employee of the organisation represented. """
    onsite = False
    for name in profile.names:
        for ename in employee_names:
            nsim = string_sim(name, ename)
            if nsim > 0.8:
                onsite = True
    hasfirmname = False
    for desc in profile.self_descriptions + [profile.occupation]:
        if not desc:
            continue
        for name in employer_names:
            if name in desc:
                hasfirmname = True
    employer_followed_by = [f.uid for employer_profile in employer_profiles for f in employer_profile.followed_by]
    employer_followers = [f.uid for employer_profile in employer_profiles for f in employer_profile.followers]
    count_followed_by = len([1 for f in profile.followed_by if f.uid in employer_followed_by])
    count_followers = len([1 for f in profile.followers if f.uid in employer_followers])
    if onsite:
        print("Onsite")
        if hasfirmname:
            return True
        else:
            if count_followed_by <= 2:
                if count_followers <= 1 or count_followed_by > 1:
                    return True
    return False
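#Decision path with hypothetical values: a profile whose name matches a
#scraped employee name with similarity > 0.8 is 'onsite'; if its description
#also mentions the firm it is accepted outright, otherwise acceptance depends
#on follower overlap with the organisation's own profiles (e.g.
#count_followed_by=2, count_followers=1 -> employee).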
logger.info('Identifying Employees')
#Extract the possible names of the company.
employer_names = webminer.getNames()
#Extract the names from the web page content.
#NOTE: REQUIRES the Stanford NER package to be running as a server.
employee_names = webminer.getTargets()
employer_profiles = []
employee_profiles = []
#Load all the employer profiles
for record in seedps.records:
    fname = seedprofdir+os.sep+str(record['uid'])+'.pickle'
    if os.path.exists(fname):
        with open(fname, 'rb') as fh:
            profile = pickle.load(fh)
        employer_profiles.append(profile)
#Identify employees
for record in profilestore.records:
    fname = profdir+os.sep+str(record['uid'])+'.pickle'
    if os.path.exists(fname):
        with open(fname, 'rb') as fh:
            profile = pickle.load(fh)
        if is_employee(profile, employer_profiles, employee_names, employer_names):
            employee_profiles.append(profile)
logger.info('Identified {} employees.'.format(len(employee_profiles)))
#Extract the names of the employees, to use as search terms for expansion.
namesfile = prefix+'names.txt'
confirmed_names = []
for profile in employee_profiles:
    bname = profile.bestname()
    if not bname.isnumeric():
        confirmed_names.append(bname)
confirmed_names = list(set(confirmed_names))
with open(namesfile, 'w') as wf:
    for name in confirmed_names:
        wf.write('{}\n'.format(name))
logger.info('Wrote {} names to {}'.format(len(confirmed_names), namesfile))
#Set up to download search results.
logger.info('Initialising expansion DB.')
exp_db = prefix+'expand-db.csv'
exprawdir = prefix+'expand-raw'
expprofdir = prefix+'expand-profiles'
expstore = common.profilestore.ProfileStore(exp_db, logger)
searchers = []
if fbconn:
    searchers.append(facebook.search.FacebookSearch(fbconn, expstore, logger))
if gpconn:
    searchers.append(gplus.search.GPlusSearch(gpconn, expstore, logger))
if twconn:
    searchers.append(twitter.search.TwitterSearch(twconn, expstore, logger))
if liconn:
    searchers.append(linkedin.search.LinkedInSearch(expstore, logger))
#Download search results
logger.info('Beginning expansion search phase')
for s in searchers:
    s.search_all(namesfile)
logger.info('Beginning expansion download phase')
for d in downloaders:
    d.profilestore = expstore
    d.run(dirpath=exprawdir)
logger.info('Beginning expansion analysis phase')
for a in analysers:
    a.profilestore = expstore
    a.run(indirpath=exprawdir, outdirpath=expprofdir)
#Match the employees to any of the expansion results.
logger.info('Resolving identities')
matches = resolver.cross_resolve(employee_profiles, prefix)
matches = resolver.dictify(matches)
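#From the usage below, dictify() appears to map each employee profile to the
#list of profiles resolved as the same person on other networks.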
def structtodatetime(struct):
    """ Transforms a struct_time to a datetime object.
    This is done so timedeltas can be constructed.
    :param struct_time struct: A time struct
    :return: A datetime object converted from the input. """
    return datetime.datetime(struct.tm_year, struct.tm_mon, struct.tm_mday,
                             struct.tm_hour, struct.tm_min, struct.tm_sec)
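#Example: for two struct_time values t1 and t2,
#  (structtodatetime(t1) - structtodatetime(t2)).days
#gives the signed whole-day difference used in the activity check below.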
#vulnerability item counting phase
counts = {}
for k in ['name', 'occupation', 'email', 'text', 'photo', 'hobbies', 'friends', 'phone', 'activity', 'docs']:
    counts[k] = 0
logger.info('Calculating per-employee vulnerabilities.')
for employee in employee_profiles:
    #Gather this person's identities across networks.
    person = [employee]
    if employee in matches:
        person = person + matches[employee]
    #A local scoring grid based on the global one.
    local_matches = {}
    for k in counts:
        local_matches[k] = False
    #Update the grid from each of the person's profiles.
    for profile in person:
        if not local_matches['name'] and any([not n.isnumeric() for n in profile.names]):
            local_matches['name'] = True
        if not local_matches['occupation'] and profile.occupation:
            local_matches['occupation'] = True
        if not local_matches['email'] and len(profile.email_addresses) > 0:
            local_matches['email'] = True
        if not local_matches['phone'] and len(profile.phone_numbers) > 0:
            local_matches['phone'] = True
        if not local_matches['text'] and any([ci.ctype == 3 for ci in profile.content]):
            local_matches['text'] = True
        if not local_matches['activity']:
            updatetimes = [content.time for content in profile.content if content.time is not None]
            samedaycount = 0
            for t1, t2 in itertools.combinations(updatetimes, 2):
                delta = structtodatetime(t1) - structtodatetime(t2)
                daydiff = abs(delta.days)
                if daydiff == 0:
                    samedaycount += 1
            if samedaycount > 1 and samedaycount/len(updatetimes) > 0.05:
                local_matches['activity'] = True
        if not local_matches['photo'] and len(profile.profile_images) > 0:
            local_matches['photo'] = True
        if not local_matches['hobbies'] and (profile.habits or profile.tags):
            local_matches['hobbies'] = True
        if not local_matches['friends'] and (len(profile.followers) > 0 or len(profile.followed_by) > 0):
            local_matches['friends'] = True
    for k in local_matches:
        if local_matches[k]:
            counts[k] += 1
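#On the activity threshold above: with, say, 20 timestamped posts, two or more
#same-day pairs (2/20 = 0.10 > 0.05) flag the profile as exposing an activity
#pattern.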
logger.info('Finished per-employee counting.')
#output results
sumfile = prefix+'summary.txt'
sumfh = open(sumfile, 'w')
sumfh.write("--Employees Only--\n")
for k, v in counts.items():
    sumfh.write("{}:{}\n".format(k, v))
logger.info('Accessing web-mined data')
counts['email'] += len(webminer.getEmail())
counts['phone'] += len(webminer.getPhone())
counts['docs'] += len(webminer.getDocs())
logger.info('Finished counting.')
sumfh.write("\n--Total--\n")
for k, v in counts.items():
    sumfh.write("{}:{}\n".format(k, v))
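#Aggregate the per-item counts into the two summary scores; the grouping
#follows the variable names: 'bootstrap' items (name, email, phone, activity,
#docs) versus 'accentuator' items (text, photo, hobbies, friends).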
bootstrap = counts['name'] + counts['email'] + counts['phone'] + counts['activity'] + counts['docs']
accentuator = counts['text'] + counts['photo'] + counts['hobbies'] + counts['friends']
sumfh.write("Bootstrap: {}\nAccentuator: {}\n".format(bootstrap, accentuator))
sumfh.close()