-
Notifications
You must be signed in to change notification settings - Fork 0
/
isi_data_insights_d.cfg
331 lines (303 loc) · 12.4 KB
/
isi_data_insights_d.cfg
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
[isi_data_insights_d]
# Daemon-wide settings. Parameters specified on the command line supersede
# the parameters in this section.
# pid_file: /var/run/isi_data_insights_d.pid
# log_file: /var/run/isi_data_insights_d.log
# default log_level is INFO
# log_level: DEBUG
stats_processor: influxdb_plugin
# To be prompted for the InfluxDB username and password, pass "auth" as the
# 4th value of stats_processor_args, for example:
# localhost 8086 isi_data_insights auth
stats_processor_args: localhost 8086 isi_data_insights
# Clusters in this section are queried for all stat groups.
# clusters: [username1:password1@]<ip-or-host-address1>[:True|False]
#     [[username2:password2@]<ip-or-host-address2>[:True|False]]
#     [[username3:password3@]<ip-or-host-address3>[:True|False]]
#     ...
# If you don't specify the username and password then you will be prompted
# for them when the daemon starts up.
# Use the optional True or False on the end to specify whether the cluster's
# SSL certificate should be verified. If it is omitted then the default is
# False (i.e. don't verify SSL cert).
# NOTE(review): the part after "DataInsights:" below reads "[email protected]",
# which looks like an e-mail-obfuscation placeholder rather than a real
# password@host pair - restore the real value before use. Also consider
# leaving the username and password out so they are prompted for at startup
# instead of being stored here in plain text.
clusters: DataInsights:[email protected]:False
# Specifies the active list of stat groups to query. Each stat group name
# specified here should have a corresponding section in the config file.
# Continuation lines of a multi-line value must be indented.
active_stat_groups: cluster_cpu_stats
    cluster_network_traffic_stats
    cluster_client_activity_stats
    cluster_health_stats
    ifs_space_stats
    ifs_rate_stats
    node_load_stats
    node_disk_stats
    node_net_stats
    cluster_disk_rate_stats
    cluster_proto_stats
    cache_stats
    heat_total_stats
    node_proto_stats
    node_cpu_stats
    node_space_stats
    node_ssd_util_stats
    node_capacity_rate_stats
    node_operation_stats
    concurrency_stats
# The min_update_interval_override param provides the ability to override the
# minimum interval at which the daemon will query for a set of stats. The
# purpose of the minimum interval, which defaults to 30 seconds, is to prevent
# the daemon's queries from putting too much stress on the cluster.
# min_update_interval_override: 15
[cluster_cpu_stats]
# Cluster-level CPU stats: sys, user, idle and intr averages.
# The clusters (optional) param defines a list of clusters specific to this
# group.
# clusters: 10.25.69.74 10.25.69.75
# update_interval is in seconds, or use *<number> to base the update interval
# off each stat's collection interval (i.e. *2 == 2 times the collection
# interval, *1 == * == 1 times the collection interval of each stat).
update_interval: *
stats: cluster.cpu.sys.avg
    cluster.cpu.user.avg
    cluster.cpu.idle.avg
    cluster.cpu.intr.avg
[node_cpu_stats]
# Node-level CPU stats: sys, user, idle and intr averages.
# '*' uses each stat's own collection interval as the update interval.
update_interval: *
stats: node.cpu.sys.avg
    node.cpu.user.avg
    node.cpu.idle.avg
    node.cpu.intr.avg
[node_proto_stats]
# Node-level protocol stats for nfs and smb2.
# '*' uses each stat's own collection interval as the update interval.
update_interval: *
stats: node.protostats.nfs
    node.protostats.smb2
[cluster_network_traffic_stats]
# Cluster-level external network rates: bytes, packets and errors, in and out.
# '*' uses each stat's own collection interval as the update interval.
update_interval: *
stats: cluster.net.ext.bytes.in.rate
    cluster.net.ext.bytes.out.rate
    cluster.net.ext.packets.in.rate
    cluster.net.ext.packets.out.rate
    cluster.net.ext.errors.in.rate
    cluster.net.ext.errors.out.rate
[cluster_client_activity_stats]
# Active and connected client stats per protocol. Despite the group name,
# every key listed here is a node-level (node.clientstats.*) stat.
# '*' uses each stat's own collection interval as the update interval.
update_interval: *
stats: node.clientstats.active.ftp
    node.clientstats.active.hdfs
    node.clientstats.active.http
    node.clientstats.active.lsass_out
    node.clientstats.active.jobd
    node.clientstats.active.nfs
    node.clientstats.active.nfs4
    node.clientstats.active.nlm
    node.clientstats.active.papi
    node.clientstats.active.siq
    node.clientstats.active.cifs
    node.clientstats.active.smb2
    node.clientstats.connected.ftp
    node.clientstats.connected.hdfs
    node.clientstats.connected.http
    node.clientstats.connected.nfs
    node.clientstats.connected.nlm
    node.clientstats.connected.papi
    node.clientstats.connected.siq
    node.clientstats.connected.cifs
[cluster_health_stats]
# Cluster health plus the total and down node counts.
# '*' uses each stat's own collection interval as the update interval.
update_interval: *
stats: cluster.health
    cluster.node.count.all
    cluster.node.count.down
[ifs_space_stats]
# /ifs space stats: avail, free, used and total bytes, plus the free, avail
# and used percentages.
# '*' uses each stat's own collection interval as the update interval.
update_interval: *
stats: ifs.bytes.avail
    ifs.bytes.free
    ifs.bytes.used
    ifs.bytes.total
    ifs.percent.free
    ifs.percent.avail
    ifs.percent.used
[ifs_rate_stats]
# /ifs byte and operation rates, in and out.
# '*' uses each stat's own collection interval as the update interval.
update_interval: *
stats: ifs.bytes.in.rate
    ifs.bytes.out.rate
    ifs.ops.in.rate
    ifs.ops.out.rate
[node_load_stats]
# Node-level load stats: CPU throttling, 1/5/15-minute load, memory
# (used, free, cache) and open files.
# '*' uses each stat's own collection interval as the update interval.
update_interval: *
stats: node.cpu.throttling
    node.load.1min
    node.load.5min
    node.load.15min
    node.memory.used
    node.memory.free
    node.memory.cache
    node.open.files
[node_disk_stats]
# Node-level disk averages: byte and transfer rates, transfer sizes, busy,
# access latency/slow and I/O scheduler queue/latency.
# '*' uses each stat's own collection interval as the update interval.
update_interval: *
stats: node.disk.bytes.out.rate.avg
    node.disk.bytes.in.rate.avg
    node.disk.busy.avg
    node.disk.xfers.out.rate.avg
    node.disk.xfers.in.rate.avg
    node.disk.xfer.size.out.avg
    node.disk.xfer.size.in.avg
    node.disk.access.latency.avg
    node.disk.access.slow.avg
    node.disk.iosched.queue.avg
    node.disk.iosched.latency.avg
[node_net_stats]
# Node-level network rates for the internal (int) and external (ext)
# interfaces: bytes and errors, in and out.
# '*' uses each stat's own collection interval as the update interval.
update_interval: *
stats: node.net.int.bytes.in.rate
    node.net.int.bytes.out.rate
    node.net.ext.bytes.in.rate
    node.net.ext.bytes.out.rate
    node.net.int.errors.in.rate
    node.net.int.errors.out.rate
    node.net.ext.errors.in.rate
    node.net.ext.errors.out.rate
[cluster_disk_rate_stats]
# Cluster-level disk transfer and byte rates.
# '*' uses each stat's own collection interval as the update interval.
update_interval: *
stats: cluster.disk.xfers.rate
    cluster.disk.xfers.in.rate
    cluster.disk.xfers.out.rate
    cluster.disk.bytes.in.rate
    cluster.disk.bytes.out.rate
[cluster_proto_stats]
# Cluster-level protocol stats: one key per protocol, followed by the
# matching ".total" key for each protocol.
# '*' uses each stat's own collection interval as the update interval.
update_interval: *
stats: cluster.protostats.nfs
    cluster.protostats.nlm
    cluster.protostats.cifs
    cluster.protostats.ftp
    cluster.protostats.http
    cluster.protostats.siq
    cluster.protostats.jobd
    cluster.protostats.smb2
    cluster.protostats.nfs4
    cluster.protostats.irp
    cluster.protostats.lsass_in
    cluster.protostats.lsass_out
    cluster.protostats.papi
    cluster.protostats.hdfs
    cluster.protostats.nfs.total
    cluster.protostats.nlm.total
    cluster.protostats.cifs.total
    cluster.protostats.ftp.total
    cluster.protostats.http.total
    cluster.protostats.siq.total
    cluster.protostats.jobd.total
    cluster.protostats.smb2.total
    cluster.protostats.nfs4.total
    cluster.protostats.irp.total
    cluster.protostats.lsass_in.total
    cluster.protostats.lsass_out.total
    cluster.protostats.papi.total
    cluster.protostats.hdfs.total
[cache_stats]
# Single node-level stat: node.ifs.cache.
# '*' uses the stat's own collection interval as the update interval.
update_interval: *
stats: node.ifs.cache
[heat_total_stats]
# Node-level node.ifs.heat.*.total stats, one per heat category (lock,
# blocked, contended, deadlocked, write, read, lookup, rename, link, unlink,
# getattr, setattr).
# '*' uses each stat's own collection interval as the update interval.
update_interval: *
stats: node.ifs.heat.lock.total
    node.ifs.heat.blocked.total
    node.ifs.heat.contended.total
    node.ifs.heat.deadlocked.total
    node.ifs.heat.write.total
    node.ifs.heat.read.total
    node.ifs.heat.lookup.total
    node.ifs.heat.rename.total
    node.ifs.heat.link.total
    node.ifs.heat.unlink.total
    node.ifs.heat.getattr.total
    node.ifs.heat.setattr.total
[node_space_stats]
# Node-level /ifs free, used and total bytes.
# Fixed update interval, in seconds.
update_interval: 60
stats: node.ifs.bytes.free
    node.ifs.bytes.used
    node.ifs.bytes.total
[node_ssd_util_stats]
# Node-level SSD free, used and total bytes.
# Fixed update interval, in seconds.
update_interval: 60
stats: node.ifs.ssd.bytes.free
    node.ifs.ssd.bytes.used
    node.ifs.ssd.bytes.total
[node_capacity_rate_stats]
# Node-level /ifs bytes deleted, in and out, with their rates and the
# in/out rate maximums.
# Fixed update interval, in seconds.
update_interval: 60
stats: node.ifs.bytes.deleted
    node.ifs.bytes.in
    node.ifs.bytes.out
    node.ifs.bytes.deleted.rate
    node.ifs.bytes.in.rate
    node.ifs.bytes.out.rate
    node.ifs.bytes.in.rate.max
    node.ifs.bytes.out.rate.max
[node_operation_stats]
# Node-level /ifs file creation/removal, lookup and operation stats, each
# together with its rate.
# Fixed update interval, in seconds.
update_interval: 60
stats: node.ifs.files.created
    node.ifs.files.removed
    node.ifs.num.lookups
    node.ifs.files.created.rate
    node.ifs.files.removed.rate
    node.ifs.num.lookups.rate
    node.ifs.ops.in
    node.ifs.ops.out
    node.ifs.ops.in.rate
    node.ifs.ops.out.rate
# These stats serve as an example of how to use the derived stats functionality.
# See the comments below for more details. Note that this group is listed in
# active_stat_groups in the [isi_data_insights_d] section, so it is active;
# remove it from that list to disable it.
[concurrency_stats]
# Base stats that feed the derived stats defined further down in this section.
# '*' uses each stat's own collection interval as the update interval.
# Each protocol's ".total" stat is listed exactly once; the equations below
# must use the same set of protocols.
update_interval: *
stats: node.ifs.ops.in node.ifs.ops.out node.disk.iosched.latency.avg
    cluster.protostats.nfs.total
    cluster.protostats.smb2.total
    cluster.protostats.nlm.total
    cluster.protostats.cifs.total
    cluster.protostats.http.total
    cluster.protostats.siq.total
    cluster.protostats.nfs4.total
    cluster.protostats.hdfs.total
    cluster.protostats.ftp.total
# The composite_stats, equation_stats, percent_change_stats, final_equation_stats sections allow you to
# specify new stats that are derived from the values of other stats. You can derive stats from base stats
# or even specific fields or indices within a base stat's value, which is actually required if the
# base stat's value is not a float or integer (i.e. it is a dict or list). See below for more
# info on each type of derived stat.
#### Composite Stats Description #####
# The composite_stats parameter specifies a list of node specific stats (i.e. stats whose name
# start with "node.") where each stat is composited across the entire cluster using the specified
# operation. Supported operations at this time are avg, max, min, and sum.
# The output name of a composite_stat is: cluster.<name of original stat>.[<field1>[...<fieldN>]].<name of operation>,
# so for the three composite stats below it would be cluster.node.ifs.ops.in.sum,
# cluster.node.ifs.ops.out.sum, and cluster.node.disk.iosched.latency.avg.avg. If the base stat
# contains one or more fields then those are appended to the name with '.' as delimiter, e.g.:
# sum(node.protostats.nfs.total:op_count) -> cluster.node.protostats.nfs.total.op_count.sum
composite_stats: sum(node.ifs.ops.in) sum(node.ifs.ops.out) avg(node.disk.iosched.latency.avg)
#### Equation Stats Description #####
# The equation_stats parameter specifies a list of output stat names for stats that will be
# derived from an equation that takes as input either base stat values or composite_stats values.
# The equation for each equation stat is specified in a parameter named the same as the equation
# stat.
equation_stats: cluster.ifs.concurrency cluster.protostats.all.total.op_count cluster.protostats.all.total.time_avg
# This is the definition of the equation used to compute the cluster.ifs.concurrency stat.
# Any of the base stats or any composite stat can be used in the equation expression. Any
# expression supported by the Equation package of Python can be used:
# https://pypi.python.org/pypi/Equation
cluster.ifs.concurrency: (cluster.node.ifs.ops.in.sum + cluster.node.ifs.ops.out.sum) * cluster.node.disk.iosched.latency.avg.avg
# The cluster.protostats.all.total.op_count is a sum of all 9 of the different protocols' op_count
# (nfs, smb2, nlm, cifs, http, siq, nfs4, hdfs, ftp), each counted exactly once.
# This equation shows an example of how to select a specific field within a stat that returns a dict, in this case the op_count
# field. Note that some stats are returned as list with always only a single dict item - in those cases the value is treated
# as if it was just a dict. Otherwise, to index into a list you would use numeric field names after the colon. Multiple field
# names or list indices are allowed (i.e. node.example.stat:field1:field2:field3...).
cluster.protostats.all.total.op_count: cluster.protostats.nfs.total:op_count + cluster.protostats.smb2.total:op_count + cluster.protostats.nlm.total:op_count + cluster.protostats.cifs.total:op_count + cluster.protostats.http.total:op_count + cluster.protostats.siq.total:op_count + cluster.protostats.nfs4.total:op_count + cluster.protostats.hdfs.total:op_count + cluster.protostats.ftp.total:op_count
# This stat computes the sum of the time_avg field over the same 9 protocols and then takes an average.
# The divisor must equal the number of terms in the sum.
cluster.protostats.all.total.time_avg: (cluster.protostats.nfs.total:time_avg + cluster.protostats.smb2.total:time_avg + cluster.protostats.nlm.total:time_avg + cluster.protostats.cifs.total:time_avg + cluster.protostats.http.total:time_avg + cluster.protostats.siq.total:time_avg + cluster.protostats.nfs4.total:time_avg + cluster.protostats.hdfs.total:time_avg + cluster.protostats.ftp.total:time_avg) / 9.0
#### Percent Change Stats Description #####
# The percent_change_stats section specifies a list of base stats, composite stats, and/or equation
# stats whose percent change from one measurement to the next will be stored in a new stat whose
# name will be <name of original stat>.percentchange
percent_change_stats: cluster.node.disk.iosched.latency.avg.avg cluster.protostats.all.total.time_avg
#### Final Equation Stats Description #####
# The final_equation_stats is the same as the equation_stats section except these equations have access to base stats and all of the previously
# defined derived stats as input. Again list the names of the output stats and then give the equation for each output stat in a parameter of
# that same name.
final_equation_stats: cluster.ifs.concurrency.importance
# Definition of the cluster.ifs.concurrency.importance final equation stat
cluster.ifs.concurrency.importance: (cluster.protostats.all.total.op_count * cluster.protostats.all.total.time_avg) * cluster.node.disk.iosched.latency.avg.avg.percentchange