From b2add4c8e8d00f45655c1099499c3ec5f66bdddd Mon Sep 17 00:00:00 2001 From: Jason Peacock Date: Thu, 14 Nov 2024 08:29:58 -0600 Subject: [PATCH] Improve zenperfsnmp logging. Also updated code to support latest pynetsnmp changes. ZEN-35105 --- Products/DataCollector/SnmpClient.py | 13 ++- Products/ZenCollector/tasks.py | 12 +- Products/ZenEvents/zentrap/users.py | 2 +- Products/ZenHub/services/SnmpTrapConfig.py | 20 ++-- Products/ZenRRD/zenperfsnmp.py | 129 ++++++++++++--------- Products/ZenRRD/zenprocess.py | 10 +- 6 files changed, 105 insertions(+), 81 deletions(-) diff --git a/Products/DataCollector/SnmpClient.py b/Products/DataCollector/SnmpClient.py index 5e9fe71632..1696d2a4d0 100644 --- a/Products/DataCollector/SnmpClient.py +++ b/Products/DataCollector/SnmpClient.py @@ -15,7 +15,7 @@ from twisted.internet import reactor, error, defer from twisted.python import failure from twisted.internet.error import TimeoutError -from pynetsnmp.twistedsnmp import snmpprotocol, Snmpv3Error +from pynetsnmp.twistedsnmp import snmpprotocol, SnmpUsmError from Products.ZenEvents import Event from Products.ZenEvents.ZenEventClasses import Status_Snmp @@ -135,9 +135,10 @@ def doRun(self, driver): self.initSnmpProxy() else: raise - except Snmpv3Error: + except SnmpUsmError as ex: log.error( - "cannot connect to SNMP agent %s", self.connInfo.summary() + "cannot connect to SNMP agent error=%s %s", + ex, self.connInfo.summary() ) raise except Exception: @@ -277,11 +278,11 @@ def clientFinished(self, result): ) summary = "SNMP agent down - no response received" log.info("Sending event: %s", summary) - elif isinstance(result.value, Snmpv3Error): + elif isinstance(result.value, SnmpUsmError): log.error( - "Connection to device %s failed: %s", + "SNMP connection failed device=%s error=%s", self.hostname, - result.value.message, + result.value, ) summary = "SNMP v3 specific error during SNMP collection" else: diff --git a/Products/ZenCollector/tasks.py b/Products/ZenCollector/tasks.py index 5350c5c1eb..89695174cd 100644 --- a/Products/ZenCollector/tasks.py +++ b/Products/ZenCollector/tasks.py @@ -7,11 +7,15 @@ # ############################################################################## +from __future__ import division + import logging import random from copy import copy +import six + import zope.component import zope.interface @@ -60,7 +64,7 @@ def _delayNextCheck(self): """ # If it's not responding, don't poll it so often if self.interval != self._maxbackoffseconds: - delay = random.randint(int(self.interval / 2), self.interval) * 2 + delay = _randomize_delay(self.interval) self.interval = min(self._maxbackoffseconds, self.interval + delay) log.debug( "Delaying next check for another %s", @@ -85,7 +89,11 @@ def chunk(self, lst, n): """ Break lst into n-sized chunks """ - return [lst[i : i + n] for i in xrange(0, len(lst), n)] + return [lst[i : i + n] for i in six.moves.range(0, len(lst), n)] + + +def _randomize_delay(interval): + return random.randint(interval // 2, interval) * 2 # noqa: S311 @zope.interface.implementer(ITaskSplitter) diff --git a/Products/ZenEvents/zentrap/users.py b/Products/ZenEvents/zentrap/users.py index 91b6146aca..a78795853e 100644 --- a/Products/ZenEvents/zentrap/users.py +++ b/Products/ZenEvents/zentrap/users.py @@ -31,7 +31,7 @@ def task(self): try: service = yield self._app.getRemoteConfigServiceProxy() users = yield service.callRemote("createAllUsers") - diffs = tuple(user for user in users if user not in self._users) + diffs = tuple(u for u in users if u not in self._users) if diffs: log.debug( "received %d new/updated user%s", diff --git a/Products/ZenHub/services/SnmpTrapConfig.py b/Products/ZenHub/services/SnmpTrapConfig.py index e51dc4e62e..d7e7c59e76 100644 --- a/Products/ZenHub/services/SnmpTrapConfig.py +++ b/Products/ZenHub/services/SnmpTrapConfig.py @@ -44,25 +44,23 @@ class FakeDevice(object): class User(UsmUser, pb.Copyable, pb.RemoteCopy): def getStateToCopy(self): state = pb.Copyable.getStateToCopy(self) - if self.auth: + if self.auth is not None: state["auth"] = [self.auth.protocol.name, self.auth.passphrase] else: state["auth"] = None - if self.priv: + if self.priv is not None: state["priv"] = [self.priv.protocol.name, self.priv.passphrase] else: state["priv"] = None return state def setCopyableState(self, state): - auth = state.get("auth") - state["auth"] = Authentication(*auth) if auth else None - priv = state.get("priv") - state["priv"] = Privacy(*priv) if priv else None + auth_args = state.get("auth") + state["auth"] = Authentication(*auth_args) if auth_args else None + priv_args = state.get("priv") + state["priv"] = Privacy(*priv_args) if priv_args else None pb.RemoteCopy.setCopyableState(self, state) - pass - pb.setUnjellyableForClass(User, User) @@ -80,14 +78,14 @@ def remote_createAllUsers(self): "Products.ZenModel.DeviceClass.DeviceClass", ) ) - users = [] + users = set() for brain in brains: device = brain.getObject() user = self._create_user(device) if user is not None: - users.append(user) + users.add(user) log.debug("SnmpTrapConfig.remote_createAllUsers %s users", len(users)) - return users + return list(users) def remote_getTrapFilters(self, remoteCheckSum): currentCheckSum = md5(self.zem.trapFilters).hexdigest() # noqa S324 diff --git a/Products/ZenRRD/zenperfsnmp.py b/Products/ZenRRD/zenperfsnmp.py index 5b7e905962..874f2f5b07 100755 --- a/Products/ZenRRD/zenperfsnmp.py +++ b/Products/ZenRRD/zenperfsnmp.py @@ -21,8 +21,10 @@ import zope.interface -from pynetsnmp.netsnmp import SnmpTimeoutError, SnmpError -from pynetsnmp.twistedsnmp import snmpprotocol, Snmpv3Error +from pynetsnmp import oids +from pynetsnmp.netsnmp import SnmpTimeoutError, NetSnmpError +from pynetsnmp.twistedsnmp import snmpprotocol +from pynetsnmp.errors import SnmpUsmError, SnmpUsmStatsError from twisted.internet import defer, error from Products.ZenCollector.daemon import CollectorDaemon @@ -133,6 +135,17 @@ class StopTask(Exception): STATUS_EVENT = {"eventClass": Status_Snmp, "eventGroup": "SnmpTest"} +_usm_stat_message = { + oids.WrongDigest: ( + "invalid zSnmpAuthPassphrase/zSnmpAuthProtocol properties" + ), + oids.DecryptionError: ( + "invalid zSnmpPrivPassphrase/zSnmpPrivProtocol properties" + ), + oids.UnknownUserName: "invalid zSnmpSecurityName property", + oids.UnknownSecurityLevel: "invalid zSnmp* properties", +} + @zope.interface.implementer(IScheduledTask) class SnmpPerformanceCollectionTask(BaseTask): @@ -367,7 +380,7 @@ def _fetchPerfChunk(self, oid_chunk): self._snmpConnInfo.zSnmpTimeout, self._snmpConnInfo.zSnmpTries, ) - except (error.TimeoutError, SnmpTimeoutError): + except (error.TimeoutError, SnmpTimeoutError, SnmpUsmStatsError): raise except Exception as e: log.exception( @@ -393,31 +406,19 @@ def _fetchPerfChunk(self, oid_chunk): # remove them from good oids. These will run in single mode so # we can figure out which ones are good or bad. if len(oid_chunk) == 1: - self.remove_from_good_oids(oid_chunk) - self._addBadOids(oid_chunk) - log.warn( - "No return result, marking as bad oid: {%s} {%s}", - self.configId, - oid_chunk, - ) + self._mark_bad_oids(oid_chunk) else: - log.warn( - "No return result, will run in separately to determine " - "which oids are valid: {%s} {%s}", + log.warning( + "No results returned, will run in separately to " + "determine which oids are valid device=%s oids=%s", self.configId, oid_chunk, ) - self.remove_from_good_oids(oid_chunk) + self._remove_from_good_oids(oid_chunk) else: for oid in oid_chunk: if oid not in update: - log.error( - "SNMP get did not return result: %s %s", - self.configId, - oid, - ) - self.remove_from_good_oids([oid]) - self._addBadOids([oid]) + self._mark_bad_oids([oid]) self.state = SnmpPerformanceCollectionTask.STATE_STORE_PERF try: for oid, value in update.items(): @@ -432,12 +433,7 @@ def _fetchPerfChunk(self, oid_chunk): # We should always get something useful back if value == "" or value is None: if oid not in self._bad_oids: - log.error( - "SNMP get returned empty value: %s %s", - self.configId, - oid, - ) - self._addBadOids([oid]) + self._mark_bad_oids([oid]) continue self._good_oids.add(oid) @@ -541,7 +537,7 @@ def _doCollectOids(self, ignored): except StopTask as e: taskStopped = True self._stoppedTaskCount += 1 - log.warn( + log.warning( "Device %s [%s] Task stopped collecting to avoid " "exceeding cycle interval - %s", self._devId, @@ -551,12 +547,20 @@ def _doCollectOids(self, ignored): self._logOidsNotCollected( "Task was stopped so as not exceed cycle interval" ) - except (error.TimeoutError, SnmpTimeoutError) as e: + except (error.TimeoutError, SnmpTimeoutError): log.debug( "Device %s [%s] snmp timed out ", self._devId, self._manageIp, ) + except SnmpUsmStatsError as ex: + message = _usm_stat_message.get(ex.oid) + if not message: + # The UnknownSecurityLevel message also works as a + # generic USM stats error message. + message = _usm_stat_message.get(oids.UnknownSecurityLevel) + log.error("%s device=%s", message, self._devId) + raise if self._snmpConnInfo.zSnmpVer == "v3": self._sendStatusEvent( @@ -636,7 +640,7 @@ def _doCollectOids(self, ignored): ) except CycleExceeded as e: self._cycleExceededCount += 1 - log.warn( + log.warning( "Device %s [%s] scan stopped because time exceeded " "cycle interval, %s", self._devId, @@ -648,44 +652,57 @@ def _doCollectOids(self, ignored): "Scan stopped; Collection time exceeded interval - %s" % e, eventKey="interval_exceeded", ) - except Snmpv3Error as e: + except SnmpUsmError as e: self._logOidsNotCollected("of %s" % (e,)) self._snmpV3ErrorCount += 1 - summary = "Cannot connect to SNMP agent on {0._devId}: {1}".format( - self, e + log.error( + "cannot connect to SNMP agent device=%s error=%s", + self.configId, + e, + ) + self._sendStatusEvent( + "Cannot connect to SNMP agent on {0._devId}: {1}".format( + self, e + ), + eventKey="snmp_v3_error", ) - log.error("%s on %s", summary, self.configId) - self._sendStatusEvent(summary, eventKey="snmp_v3_error") - except SnmpError as e: + except NetSnmpError as e: self._logOidsNotCollected("of %s" % (e,)) - summary = "Cannot connect to SNMP agent on {0._devId}: {1}".format( - self, e + log.error( + "cannot connect to SNMP agent device=%s error=%s", + self.configId, + e, + ) + self._sendStatusEvent( + "Cannot connect to SNMP agent on {0._devId}: {1}".format( + self, e + ), + eventKey="snmp_error", ) - log.error("%s on %s", summary, self.configId) - self._sendStatusEvent(summary, eventKey="snmp_error") finally: self._logTaskOidInfo(previous_bad_oids) - def remove_from_good_oids(self, oids): + def _remove_from_good_oids(self, oids): self._good_oids.difference_update(oids) - def _addBadOids(self, oids): + def _mark_bad_oids(self, oids): """ Report any bad OIDs and then track the OID so we don't generate any further errors. """ # make sure oids aren't in good set - self.remove_from_good_oids(oids) + self._remove_from_good_oids(oids) for oid in oids: - if oid in self._oids: - self._bad_oids.add(oid) - names = [dp[0] for dp in self._oids[oid]] - summary = "Error reading value for %s (%s) on %s" % ( - names, - oid, - self._devId, - ) - log.warn(summary) + if oid in self._bad_oids or oid not in self._oids: + continue + self._bad_oids.add(oid) + names = [dp[0] for dp in self._oids[oid]] + log.warning( + "no result for oid device=%s oid=%s names=%s", + self._devId, + oid, + names, + ) def _finished(self, result): """ @@ -698,12 +715,12 @@ def _finished(self, result): try: self._close() except Exception as ex: - log.warn("Failed to close device %s: error %s", self._devId, ex) + log.warning("Failed to close device %s: error %s", self._devId, ex) doTask_end = datetime.now() duration = doTask_end - self._doTask_start if duration > timedelta(seconds=self._device.cycleInterval): - log.warn( + log.warning( "Collection for %s took %s seconds; " "cycle interval is %s seconds.", self.configId, @@ -793,7 +810,7 @@ def _logOidsNotCollected(self, reason): oidsNotCollected, ) - def _connect(self, result=None): + def _connect(self, ignored=None): """ Create a connection to the remote device """ @@ -813,7 +830,7 @@ def _connect(self, result=None): severity=Event.Clear, ) except Exception as ex: - self._snmpProxy = None + self._close() log.error("failed to create SNMP session: %s", ex) self._sendStatusEvent( "SNMP config error: {}".format(ex), diff --git a/Products/ZenRRD/zenprocess.py b/Products/ZenRRD/zenprocess.py index 643d6fd436..04ee1255b0 100755 --- a/Products/ZenRRD/zenprocess.py +++ b/Products/ZenRRD/zenprocess.py @@ -24,7 +24,7 @@ import zope.component import zope.interface -from pynetsnmp.twistedsnmp import Snmpv3Error +from pynetsnmp.twistedsnmp import SnmpUsmError from twisted.internet import defer, error from Products.ZenCollector.daemon import CollectorDaemon @@ -468,10 +468,10 @@ def _collectCallback(self): "%s; Timeout on device" % (PROC_SCAN_ERROR % self._devId,), TABLE_SCAN_TIMEOUT, ) - except Snmpv3Error as e: + except SnmpUsmError as e: msg = ( - "Cannot connect to SNMP agent on {0._devId}: {1.value}".format( - self, str(e) + "Cannot connect to SNMP agent on {0._devId}: {1}".format( + self, e ) ) log.debug(msg) @@ -773,7 +773,7 @@ def _fetchPerf(self): ) result = yield self._get(oidChunk) results.update(result) - except (error.TimeoutError, Snmpv3Error) as e: + except (error.TimeoutError, SnmpUsmError) as e: log.debug("error reading oid(s) %s - %s", oidChunk, e) singleOids.update(oidChunk) oidsToTest = []