-
Notifications
You must be signed in to change notification settings - Fork 0
/
Crawler.rb
262 lines (204 loc) · 8.56 KB
/
Crawler.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
require 'rbconfig'

# Crawler implements the monitoring operations: it repeatedly pings a host or
# issues HTTP GET requests, writes one data point per attempt through the
# InfluxDB object it was given, and asks the Alert object to send an email once
# the number of consecutive failures reaches the configured threshold.
class Crawler
  # One dotted-quad octet (0-255, leading zeros tolerated).
  IPV4_OCTET = '(?:25[0-5]|2[0-4]\d|[01]?\d?\d)'.freeze
  # Whole-string match for an IPv4 address. Built from a single-quoted string so
  # that "\." stays a literal dot, and anchored with \A/\z rather than ^/$.
  IPV4_PATTERN = /\A(?:#{IPV4_OCTET}\.){3}#{IPV4_OCTET}\z/

  # Upper bound, in seconds, on how long a single ping or HTTP GET may take.
  PING_TIMEOUT_SECONDS = 5
  HTTP_TIMEOUT_SECONDS = 5

  # accessors for instance variables
  attr_accessor :objId, :alertObj, :logObj, :influxDBObj
  attr_accessor :pingDelay, :pingDest, :pingCounter, :pingLimit, :pingFileHeader, :pingDstIsIP
  attr_accessor :httpUrl, :httpDelay, :httpCounter, :httpLimit, :httpFileHeader
  # generic variables
  attr_accessor :failCounter, :alertSent

  # aObj:  reference to Alert object
  # lObj:  reference to Logger object
  # dbObj: reference to influxdb object
  # id:    useful when using multiple RCrawler processes
  def initialize(aObj, lObj, dbObj, id=-1)
    @objId, @alertObj, @logObj, @influxDBObj = id, aObj, lObj, dbObj
    @logObj.debug("thrId: #{@objId}, #{aObj.inspect}, #{lObj.inspect}")
    @failCounter = 0
    @alertSent = false
  end

  # Ping opInput.dest every opInput.interval seconds.
  #
  # opInput: an Operation; dest is the host or IP address, interval the delay in
  #          seconds between pings, reps the number of pings to send (a value
  #          <= 0 keeps pinging forever).
  #
  # Each attempt writes a "PING_<dest>" point holding latency, ttl and the ping
  # exit status. A failed attempt is stored with latency -1.0 and ttl 0.
  # Returns nil. Returns immediately, without pinging, when opInput is not an
  # Operation.
  def runPings(opInput)
    # validate before touching opInput, otherwise a wrong type raises NoMethodError
    unless opInput.is_a?(Operation)
      puts "runPings(...) is expecting an Operation object"
      puts "EXCEPTION, runPings(...): UnexpectedType"
      @logObj.error("thrId: #{@objId}, runPings(...): UnexpectedType")
      return
    end

    @pingDelay = opInput.interval.to_i # delay, in seconds, between ping commands
    @pingDest = opInput.dest.to_s      # destination IP address or hostname
    @pingCounter = 0                   # number of pings taken so far
    @pingLimit = opInput.reps.to_i     # number of pings to send
    @pingDstIsIP = !(IPV4_PATTERN =~ @pingDest).nil?

    loop do
      latency = -1.0
      ttl = 0
      # argv form: no shell is involved, so the destination cannot inject commands.
      # Options come before the host so both GNU and BSD option parsing accept them.
      stdout, _stderr, status = Open3.capture3("ping", "-c", "1", "-W", pingWaitArg, @pingDest)
      # taken for every attempt so that failed pings are stored with a real timestamp
      timestamp = currentMillis

      # the exit status decides success: a timed-out ping leaves stderr empty
      reply = status.success? ? parsePingReply(stdout) : nil
      if reply
        latency, ttl = reply
        # reset alert variables
        @failCounter = 0
        @alertSent = false
      else
        @failCounter += 1
        @logObj.debug("thrId: #{@objId}, #{@pingDest} failCounter=#{@failCounter}, exitstatus=#{status.exitstatus}")
        # send alert, but only one per-occurrence
        if @alertObj.sendAlerts && @failCounter >= @alertObj.maxPingsBeforeAlert.to_i && !@alertSent
          @alertSent = sendAlert("PING", @pingDest, @failCounter, @objId)
        end
      end

      # store data in influxDB
      data = {
        values: {latency: latency, ttl: ttl, exitStatus: status.exitstatus},
        timestamp: timestamp,
      }
      @influxDBObj.write_point("PING_" + @pingDest, data)

      # show something in the terminal
      pingOutput = "#{@objId}, #{@pingCounter}, #{@pingDest}"
      pingOutput += ", latency: #{latency} ms, ttl: #{ttl}"
      pingOutput += ", code: #{status.exitstatus}"
      puts pingOutput
      @logObj.info("thrId: #{@objId}, #{pingOutput}")

      # increase counters and see if the loop needs to continue
      @pingCounter += 1
      if @pingLimit > 0 && @pingCounter >= @pingLimit
        puts "thrId: #{@objId} has completed #{@pingLimit} PING operations, #{@pingDest}"
        @logObj.info("thrId: #{@objId} has completed #{@pingLimit} PING operations, #{@pingDest}")
        break
      end
      sleep @pingDelay
    end
  end

  # Run a GET query for opInput.dest every opInput.interval seconds.
  #
  # opInput: an Operation; dest is the URL (it NEEDS to start with http:// or
  #          https://), interval the delay in seconds between queries, reps the
  #          number of queries (a value <= 0 keeps querying forever),
  #          httpFileHeader is stored in @httpFileHeader.
  #
  # Each attempt writes an "HTTP_<url>" point holding body length, response code
  # and duration in milliseconds. A failed attempt is stored with -1 for all three.
  # Returns nil. Returns immediately, without querying, when opInput is not an
  # Operation.
  def runHttpQueries(opInput)
    unless opInput.is_a?(Operation)
      puts "runHttpQueries(...) is expecting an Operation object"
      puts "EXCEPTION: UnexpectedType"
      return
    end

    @httpUrl = opInput.dest.to_s
    @httpDelay = opInput.interval.to_i
    @httpCounter = 0
    @httpLimit = opInput.reps.to_i
    @httpFileHeader = opInput.httpFileHeader.to_s # static file headers to put at the top of an output file

    loop do
      startTime = currentMillis
      duration = -1
      hValues = {length: -1, code: -1, duration: -1} # stored as-is when the query fails
      begin
        queryResponse = HTTParty.get(@httpUrl, {timeout: HTTP_TIMEOUT_SECONDS})
      rescue StandardError => e
        # StandardError only: Interrupt/SystemExit must still be able to stop the crawler
        currEpoch = currentMillis
        puts "#{e}"
        @logObj.error("thrId: #{@objId}, runHttpQueries() #{e}")
        @failCounter += 1
        @logObj.debug("thrId: #{@objId}, #{@httpUrl} failCounter=#{@failCounter}")
        # send alert, but only one per-occurrence
        if @alertObj.sendAlerts && @failCounter >= @alertObj.maxHTTPBeforeAlert.to_i && !@alertSent
          @alertSent = sendAlert("HTTP", @httpUrl, @failCounter, @objId)
        end
      else
        currEpoch = currentMillis         # time when query finished
        duration = currEpoch - startTime  # time taken to do the HTTP GET, in milliseconds
        # reset alert variables so that a later outage is reported again
        @failCounter = 0
        @alertSent = false
        hValues = {length: queryResponse.body.to_s.length,
                   code: queryResponse.code,
                   duration: duration}
      end

      # store data in influxDB; failures carry their own timestamp instead of
      # reusing the one from the last successful query
      data = {
        values: hValues,
        timestamp: currEpoch,
      }
      @influxDBObj.write_point("HTTP_" + @httpUrl, data)

      # show some output in the terminal
      httpOutput = "#{@objId}, #{@httpCounter}, #{@httpUrl}, #{duration} ms"
      httpOutput += ", code: #{hValues[:code]}\tlength: #{hValues[:length]}"
      puts httpOutput
      @logObj.info(httpOutput)

      # increase counters and see if the loop needs to continue
      @httpCounter += 1
      if @httpLimit > 0 && @httpCounter >= @httpLimit
        puts "thrId: #{@objId} has completed #{@httpLimit} HTTP operations, #{@httpUrl}"
        @logObj.info("thrId: #{@objId} has completed #{@httpLimit} HTTP operations, #{@httpUrl}")
        break
      end
      sleep @httpDelay
    end
  end

  # Send an email alert using the Alert object passed to the class initialiser.
  #
  # op:        operation label used in the messages ("PING" or "HTTP" in this class)
  # dest:      destination the operation was targeting
  # failCount: number of consecutive failures being reported
  # id:        accepted for caller compatibility; messages use @objId
  #
  # Returns true when sendEmailAlert returned a truthy value, false otherwise.
  def sendAlert(op, dest, failCount, id=-1)
    currEpoch = DateTime.now.strftime('%Q') # this so that logs and email alert have the same timestamp
    output = @alertObj.sendEmailAlert(op, dest, failCount, currEpoch.to_s, DateTime.strptime(currEpoch, '%Q'))
    if output
      puts "[#{currEpoch},#{@objId}]: #{op} Alert sent, #{dest}"
      @logObj.info("thrId: #{@objId}, #{op} Alert sent, #{dest}")
      true
    else
      puts "[#{currEpoch},#{@objId}]: #{op} Failed to send email alert, #{dest}"
      @logObj.info("thrId: #{@objId}, #{op} Failed to send email alert, #{dest}")
      false
    end
  end

  private

  # Current wall-clock time as integer milliseconds since the Unix epoch.
  def currentMillis
    DateTime.now.strftime('%Q').to_i
  end

  # Value for ping's -W option. macOS and FreeBSD ping read it as milliseconds,
  # Linux (iputils) reads it as seconds; other platforms are assumed to follow
  # the Linux convention -- TODO confirm for any additional target OS.
  def pingWaitArg
    usesMillis = RbConfig::CONFIG['host_os'] =~ /darwin|freebsd/i
    (usesMillis ? PING_TIMEOUT_SECONDS * 1000 : PING_TIMEOUT_SECONDS).to_s
  end

  # Extract [latency_ms, ttl] from ping output by locating the reply line, e.g.
  # "64 bytes from <ip>: icmp_seq=1 ttl=59 time=13.7 ms".
  # Returns nil when no such line is present.
  def parsePingReply(stdout)
    replyLine = stdout.to_s.each_line.find { |line| line =~ /ttl=\d+/i && line =~ /time[=<]\s*[\d.]+/i }
    return nil unless replyLine
    [replyLine[/time[=<]\s*([\d.]+)/i, 1].to_f, replyLine[/ttl=(\d+)/i, 1].to_i]
  end
end