Skip to content

Commit

Permalink
Merge pull request #4613 from zenoss/bugfix/ZEN-35105.6x
Browse files Browse the repository at this point in the history
Improve zenperfsnmp logging.
  • Loading branch information
jpeacock-zenoss authored Nov 19, 2024
2 parents 9a55d98 + b2add4c commit d4a793a
Show file tree
Hide file tree
Showing 6 changed files with 105 additions and 81 deletions.
13 changes: 7 additions & 6 deletions Products/DataCollector/SnmpClient.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
from twisted.internet import reactor, error, defer
from twisted.python import failure
from twisted.internet.error import TimeoutError
from pynetsnmp.twistedsnmp import snmpprotocol, Snmpv3Error
from pynetsnmp.twistedsnmp import snmpprotocol, SnmpUsmError

from Products.ZenEvents import Event
from Products.ZenEvents.ZenEventClasses import Status_Snmp
Expand Down Expand Up @@ -135,9 +135,10 @@ def doRun(self, driver):
self.initSnmpProxy()
else:
raise
except Snmpv3Error:
except SnmpUsmError as ex:
log.error(
"cannot connect to SNMP agent %s", self.connInfo.summary()
"cannot connect to SNMP agent error=%s %s",
ex, self.connInfo.summary()
)
raise
except Exception:
Expand Down Expand Up @@ -277,11 +278,11 @@ def clientFinished(self, result):
)
summary = "SNMP agent down - no response received"
log.info("Sending event: %s", summary)
elif isinstance(result.value, Snmpv3Error):
elif isinstance(result.value, SnmpUsmError):
log.error(
"Connection to device %s failed: %s",
"SNMP connection failed device=%s error=%s",
self.hostname,
result.value.message,
result.value,
)
summary = "SNMP v3 specific error during SNMP collection"
else:
Expand Down
12 changes: 10 additions & 2 deletions Products/ZenCollector/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,15 @@
#
##############################################################################

from __future__ import division

import logging
import random

from copy import copy

import six

import zope.component
import zope.interface

Expand Down Expand Up @@ -60,7 +64,7 @@ def _delayNextCheck(self):
"""
# If it's not responding, don't poll it so often
if self.interval != self._maxbackoffseconds:
delay = random.randint(int(self.interval / 2), self.interval) * 2
delay = _randomize_delay(self.interval)
self.interval = min(self._maxbackoffseconds, self.interval + delay)
log.debug(
"Delaying next check for another %s",
Expand All @@ -85,7 +89,11 @@ def chunk(self, lst, n):
"""
Break lst into n-sized chunks
"""
return [lst[i : i + n] for i in xrange(0, len(lst), n)]
return [lst[i : i + n] for i in six.moves.range(0, len(lst), n)]


def _randomize_delay(interval):
return random.randint(interval // 2, interval) * 2 # noqa: S311


@zope.interface.implementer(ITaskSplitter)
Expand Down
2 changes: 1 addition & 1 deletion Products/ZenEvents/zentrap/users.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ def task(self):
try:
service = yield self._app.getRemoteConfigServiceProxy()
users = yield service.callRemote("createAllUsers")
diffs = tuple(user for user in users if user not in self._users)
diffs = tuple(u for u in users if u not in self._users)
if diffs:
log.debug(
"received %d new/updated user%s",
Expand Down
20 changes: 9 additions & 11 deletions Products/ZenHub/services/SnmpTrapConfig.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,25 +44,23 @@ class FakeDevice(object):
class User(UsmUser, pb.Copyable, pb.RemoteCopy):
def getStateToCopy(self):
state = pb.Copyable.getStateToCopy(self)
if self.auth:
if self.auth is not None:
state["auth"] = [self.auth.protocol.name, self.auth.passphrase]
else:
state["auth"] = None
if self.priv:
if self.priv is not None:
state["priv"] = [self.priv.protocol.name, self.priv.passphrase]
else:
state["priv"] = None
return state

def setCopyableState(self, state):
auth = state.get("auth")
state["auth"] = Authentication(*auth) if auth else None
priv = state.get("priv")
state["priv"] = Privacy(*priv) if priv else None
auth_args = state.get("auth")
state["auth"] = Authentication(*auth_args) if auth_args else None
priv_args = state.get("priv")
state["priv"] = Privacy(*priv_args) if priv_args else None
pb.RemoteCopy.setCopyableState(self, state)

pass


pb.setUnjellyableForClass(User, User)

Expand All @@ -80,14 +78,14 @@ def remote_createAllUsers(self):
"Products.ZenModel.DeviceClass.DeviceClass",
)
)
users = []
users = set()
for brain in brains:
device = brain.getObject()
user = self._create_user(device)
if user is not None:
users.append(user)
users.add(user)
log.debug("SnmpTrapConfig.remote_createAllUsers %s users", len(users))
return users
return list(users)

def remote_getTrapFilters(self, remoteCheckSum):
currentCheckSum = md5(self.zem.trapFilters).hexdigest() # noqa S324
Expand Down
129 changes: 73 additions & 56 deletions Products/ZenRRD/zenperfsnmp.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,10 @@

import zope.interface

from pynetsnmp.netsnmp import SnmpTimeoutError, SnmpError
from pynetsnmp.twistedsnmp import snmpprotocol, Snmpv3Error
from pynetsnmp import oids
from pynetsnmp.netsnmp import SnmpTimeoutError, NetSnmpError
from pynetsnmp.twistedsnmp import snmpprotocol
from pynetsnmp.errors import SnmpUsmError, SnmpUsmStatsError
from twisted.internet import defer, error

from Products.ZenCollector.daemon import CollectorDaemon
Expand Down Expand Up @@ -133,6 +135,17 @@ class StopTask(Exception):

STATUS_EVENT = {"eventClass": Status_Snmp, "eventGroup": "SnmpTest"}

_usm_stat_message = {
oids.WrongDigest: (
"invalid zSnmpAuthPassphrase/zSnmpAuthProtocol properties"
),
oids.DecryptionError: (
"invalid zSnmpPrivPassphrase/zSnmpPrivProtocol properties"
),
oids.UnknownUserName: "invalid zSnmpSecurityName property",
oids.UnknownSecurityLevel: "invalid zSnmp* properties",
}


@zope.interface.implementer(IScheduledTask)
class SnmpPerformanceCollectionTask(BaseTask):
Expand Down Expand Up @@ -367,7 +380,7 @@ def _fetchPerfChunk(self, oid_chunk):
self._snmpConnInfo.zSnmpTimeout,
self._snmpConnInfo.zSnmpTries,
)
except (error.TimeoutError, SnmpTimeoutError):
except (error.TimeoutError, SnmpTimeoutError, SnmpUsmStatsError):
raise
except Exception as e:
log.exception(
Expand All @@ -393,31 +406,19 @@ def _fetchPerfChunk(self, oid_chunk):
# remove them from good oids. These will run in single mode so
# we can figure out which ones are good or bad.
if len(oid_chunk) == 1:
self.remove_from_good_oids(oid_chunk)
self._addBadOids(oid_chunk)
log.warn(
"No return result, marking as bad oid: {%s} {%s}",
self.configId,
oid_chunk,
)
self._mark_bad_oids(oid_chunk)
else:
log.warn(
"No return result, will run in separately to determine "
"which oids are valid: {%s} {%s}",
log.warning(
"No results returned, will run in separately to "
"determine which oids are valid device=%s oids=%s",
self.configId,
oid_chunk,
)
self.remove_from_good_oids(oid_chunk)
self._remove_from_good_oids(oid_chunk)
else:
for oid in oid_chunk:
if oid not in update:
log.error(
"SNMP get did not return result: %s %s",
self.configId,
oid,
)
self.remove_from_good_oids([oid])
self._addBadOids([oid])
self._mark_bad_oids([oid])
self.state = SnmpPerformanceCollectionTask.STATE_STORE_PERF
try:
for oid, value in update.items():
Expand All @@ -432,12 +433,7 @@ def _fetchPerfChunk(self, oid_chunk):
# We should always get something useful back
if value == "" or value is None:
if oid not in self._bad_oids:
log.error(
"SNMP get returned empty value: %s %s",
self.configId,
oid,
)
self._addBadOids([oid])
self._mark_bad_oids([oid])
continue

self._good_oids.add(oid)
Expand Down Expand Up @@ -541,7 +537,7 @@ def _doCollectOids(self, ignored):
except StopTask as e:
taskStopped = True
self._stoppedTaskCount += 1
log.warn(
log.warning(
"Device %s [%s] Task stopped collecting to avoid "
"exceeding cycle interval - %s",
self._devId,
Expand All @@ -551,12 +547,20 @@ def _doCollectOids(self, ignored):
self._logOidsNotCollected(
"Task was stopped so as not exceed cycle interval"
)
except (error.TimeoutError, SnmpTimeoutError) as e:
except (error.TimeoutError, SnmpTimeoutError):
log.debug(
"Device %s [%s] snmp timed out ",
self._devId,
self._manageIp,
)
except SnmpUsmStatsError as ex:
message = _usm_stat_message.get(ex.oid)
if not message:
# The UnknownSecurityLevel message also works as a
# generic USM stats error message.
message = _usm_stat_message.get(oids.UnknownSecurityLevel)
log.error("%s device=%s", message, self._devId)
raise

if self._snmpConnInfo.zSnmpVer == "v3":
self._sendStatusEvent(
Expand Down Expand Up @@ -636,7 +640,7 @@ def _doCollectOids(self, ignored):
)
except CycleExceeded as e:
self._cycleExceededCount += 1
log.warn(
log.warning(
"Device %s [%s] scan stopped because time exceeded "
"cycle interval, %s",
self._devId,
Expand All @@ -648,44 +652,57 @@ def _doCollectOids(self, ignored):
"Scan stopped; Collection time exceeded interval - %s" % e,
eventKey="interval_exceeded",
)
except Snmpv3Error as e:
except SnmpUsmError as e:
self._logOidsNotCollected("of %s" % (e,))
self._snmpV3ErrorCount += 1
summary = "Cannot connect to SNMP agent on {0._devId}: {1}".format(
self, e
log.error(
"cannot connect to SNMP agent device=%s error=%s",
self.configId,
e,
)
self._sendStatusEvent(
"Cannot connect to SNMP agent on {0._devId}: {1}".format(
self, e
),
eventKey="snmp_v3_error",
)
log.error("%s on %s", summary, self.configId)
self._sendStatusEvent(summary, eventKey="snmp_v3_error")
except SnmpError as e:
except NetSnmpError as e:
self._logOidsNotCollected("of %s" % (e,))
summary = "Cannot connect to SNMP agent on {0._devId}: {1}".format(
self, e
log.error(
"cannot connect to SNMP agent device=%s error=%s",
self.configId,
e,
)
self._sendStatusEvent(
"Cannot connect to SNMP agent on {0._devId}: {1}".format(
self, e
),
eventKey="snmp_error",
)
log.error("%s on %s", summary, self.configId)
self._sendStatusEvent(summary, eventKey="snmp_error")
finally:
self._logTaskOidInfo(previous_bad_oids)

def remove_from_good_oids(self, oids):
def _remove_from_good_oids(self, oids):
self._good_oids.difference_update(oids)

def _addBadOids(self, oids):
def _mark_bad_oids(self, oids):
"""
Report any bad OIDs and then track the OID so we
don't generate any further errors.
"""
# make sure oids aren't in good set
self.remove_from_good_oids(oids)
self._remove_from_good_oids(oids)
for oid in oids:
if oid in self._oids:
self._bad_oids.add(oid)
names = [dp[0] for dp in self._oids[oid]]
summary = "Error reading value for %s (%s) on %s" % (
names,
oid,
self._devId,
)
log.warn(summary)
if oid in self._bad_oids or oid not in self._oids:
continue
self._bad_oids.add(oid)
names = [dp[0] for dp in self._oids[oid]]
log.warning(
"no result for oid device=%s oid=%s names=%s",
self._devId,
oid,
names,
)

def _finished(self, result):
"""
Expand All @@ -698,12 +715,12 @@ def _finished(self, result):
try:
self._close()
except Exception as ex:
log.warn("Failed to close device %s: error %s", self._devId, ex)
log.warning("Failed to close device %s: error %s", self._devId, ex)

doTask_end = datetime.now()
duration = doTask_end - self._doTask_start
if duration > timedelta(seconds=self._device.cycleInterval):
log.warn(
log.warning(
"Collection for %s took %s seconds; "
"cycle interval is %s seconds.",
self.configId,
Expand Down Expand Up @@ -793,7 +810,7 @@ def _logOidsNotCollected(self, reason):
oidsNotCollected,
)

def _connect(self, result=None):
def _connect(self, ignored=None):
"""
Create a connection to the remote device
"""
Expand All @@ -813,7 +830,7 @@ def _connect(self, result=None):
severity=Event.Clear,
)
except Exception as ex:
self._snmpProxy = None
self._close()
log.error("failed to create SNMP session: %s", ex)
self._sendStatusEvent(
"SNMP config error: {}".format(ex),
Expand Down
Loading

0 comments on commit d4a793a

Please sign in to comment.