Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix parse_crawler_stats for None value #25

Open
wants to merge 21 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 24 additions & 22 deletions .circleci/config.yml
Original file line number Diff line number Diff line change
@@ -1,29 +1,22 @@
# Python CircleCI 2.1 configuration file
version: 2.1

orbs:
codecov: codecov/[email protected]

jobs:
py37: &test-template
py39: &test-template
docker:
- image: circleci/python:3.7

- image: cimg/python:3.9
working_directory: ~/repo

parameters:
is-py27:
type: boolean
default: false

steps:
- run:
name: Install telnet
command: |
sudo apt-get update && sudo apt-get install telnet

- checkout

- when:
condition: <<parameters.is-py27>>
steps:
Expand All @@ -38,7 +31,6 @@ jobs:
name: Create virtual env in PY3
command: |
python3 -m venv venv

- run:
name: Install dependencies
command: |
Expand All @@ -49,7 +41,6 @@ jobs:
python --version
pip install -r requirements.txt
pip install -r requirements-tests.txt

- run:
name: Run tests
command: |
Expand All @@ -73,27 +64,38 @@ jobs:
path: coverage.xml
- codecov/upload:
file: coverage.xml

py27:
<<: *test-template
docker:
- image: circleci/python:2.7

py36:
- image: cimg/python:2.7
py38:
<<: *test-template
docker:
- image: circleci/python:3.6

py38:
- image: cimg/python:3.8
py310:
<<: *test-template
docker:
- image: cimg/python:3.10
py311:
<<: *test-template
docker:
- image: cimg/python:3.11
py312:
<<: *test-template
docker:
- image: cimg/python:3.12
py313:
<<: *test-template
docker:
- image: circleci/python:3.8

- image: cimg/python:3.13
workflows:
test:
jobs:
- py27:
is-py27: true
- py36
- py37
- py38
- py39
- py310
- py311
- py312
- py313
2 changes: 1 addition & 1 deletion logparser/__version__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# coding: utf-8

__title__ = 'logparser'
__version__ = '0.8.2'
__version__ = '0.8.3'
__author__ = 'my8100'
__author_email__ = '[email protected]'
__url__ = 'https://github.com/my8100/logparser'
Expand Down
10 changes: 7 additions & 3 deletions logparser/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,9 +54,9 @@
scrapy_version=r'Scrapy[ ]\d+\.\d+\.\d+[ ]started', # Scrapy 1.5.1 started (bot: demo)
telnet_console=r'Telnet[ ]console[ ]listening[ ]on', # Telnet console listening on 127.0.0.1:6023
# Default: 'scrapy' | Overridden settings: {'TELNETCONSOLE_USERNAME': 'usr'}
telnet_username=r'Overridden[ ]settings:.+TELNETCONSOLE_USERNAME',
telnet_username=r'TELNETCONSOLE_USERNAME\W:.+',
# Telnet Password: 865bba341ef25552 | Overridden settings: {'TELNETCONSOLE_PASSWORD': 'psw'}
telnet_password=r'Overridden[ ]settings:.+TELNETCONSOLE_PASSWORD|Telnet[ ]Password:[ ].+',
telnet_password=r'TELNETCONSOLE_PASSWORD\W:.+|Telnet[ ]Password:[ ].+',
resuming_crawl=r'Resuming[ ]crawl', # Resuming crawl (675840 requests scheduled)
latest_offsite=r'Filtered[ ]offsite', # Filtered offsite request to 'www.baidu.com'
latest_duplicate=r'Filtered[ ]duplicate', # Filtered duplicate request: <GET http://httpbin.org/headers>
Expand All @@ -71,7 +71,8 @@
_odict.update({k: LATEST_MATCHES_PATTERN_DICT[k]})
LATEST_MATCHES_PATTERN_DICT = _odict
for k, v in LATEST_MATCHES_PATTERN_DICT.items():
LATEST_MATCHES_PATTERN_DICT[k] = r'^%s[ ].+?%s' % (DATETIME_PATTERN, v)
if k not in ['telnet_username', 'telnet_password']:
LATEST_MATCHES_PATTERN_DICT[k] = r'^%s[ ].+?%s' % (DATETIME_PATTERN, v)

# 2019-01-01 00:00:01 [scrapy.core.scraper] DEBUG: Scraped from <200 http://httpbin.org/headers>
LATEST_SCRAPE_ITEM_PATTERN = re.compile(r"""\n
Expand Down Expand Up @@ -179,11 +180,14 @@ def parse_crawler_stats(text):
# self.crawler.stats.inc_value(
# 'crawlera/response/error/%s' % crawlera_error.decode('utf8'))
# u"crawlera/response/error/timeout": 1
# 'items_per_minute': None,
# 'responses_per_minute': None,
backup = text
text = re.sub(r'(datetime.datetime\(.+?\))', r'"\1"', text)
text = re.sub(r'(".*?)\'(.*?)\'(.*?")', r'\1_\2_\3', text)
text = re.sub(r"'(.+?)'", r'"\1"', text)
text = re.sub(r'[bu]"(.+?)"', r'"\1"', text)
text = re.sub(r': None([,}])', r': null\1', text)
try:
return json.loads(text)
except ValueError as err:
Expand Down
62 changes: 46 additions & 16 deletions logparser/telnet.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,13 @@
import os
import platform
import re
from subprocess import Popen, PIPE
import sys
from telnetlib import DO, DONT, IAC, SB, SE, Telnet, TTYPE, WILL, WONT
# DeprecationWarning: 'telnetlib' is deprecated and slated for removal in Python 3.13
try:
import telnetlib
except ImportError:
telnetlib = None
import traceback

import pexpect
Expand Down Expand Up @@ -52,6 +57,20 @@ def __init__(self, data, override_telnet_console_host, verbose):
self.crawler_stats = {}
self.crawler_engine = {}

def _exec_cmd(self, cmd):
    """Run a diagnostic command and log its return code, stderr and stdout.

    Best-effort helper: every failure (spawn error, timeout, ...) is logged
    as a warning and never propagated to the caller.

    :param cmd: command line as a single string, e.g. "telnet 127.0.0.1 6023".
    :return: None; the outcome is only written to ``self.logger``.
    """
    # Local import so this block does not rely on a new file-level import.
    import shlex

    self.logger.info("_exec_cmd: %s", cmd)
    proc = None
    try:
        # Pass an argv list instead of a shell string: the host/port that end
        # up in cmd are parsed from log content and must not reach a shell.
        proc = Popen(shlex.split(cmd.strip()), stdin=PIPE, stdout=PIPE, stderr=PIPE)
        stdout, stderr = proc.communicate(timeout=30)
    except Exception as exc:
        # communicate() does not kill the child when the timeout expires;
        # kill and reap it here so no process or pipe is leaked.
        if proc is not None and proc.poll() is None:
            proc.kill()
            proc.communicate()
        self.logger.warning("Fail to exec cmd '%s': err %s", cmd, exc)
    else:
        # errors='replace' keeps non-UTF-8 output from being reported as a
        # command failure.
        self.logger.info(
            "Got result of cmd '%s': rc %s, err %s, output:\n%s",
            cmd,
            proc.returncode,
            stderr.decode('utf-8', 'replace'),
            stdout.decode('utf-8', 'replace'),
        )

def main(self):
try:
self.run()
Expand All @@ -65,6 +84,7 @@ def main(self):
self.host, self.port, self.data['log_path'], self.scrapy_version, err)
if self.verbose:
self.logger.error(traceback.format_exc())
self._exec_cmd("telnet %s %s" % (self.host, self.port))
finally:
if self.tn is not None:
try:
Expand Down Expand Up @@ -95,7 +115,7 @@ def run(self):
self.host = self.OVERRIDE_TELNET_CONSOLE_HOST or self.host

self.logger.debug("Try to telnet to %s:%s for %s", self.host, self.port, self.data['log_path'])
if self.telnet_password:
if self.telnet_password or telnetlib is None:
self.setup_pexpect()
if self.tn is not None:
self.pexpect_io()
Expand All @@ -107,6 +127,7 @@ def run(self):
def setup_pexpect(self):
# Cannot catch error directly here, see main()
self.tn = pexpect.spawn('telnet %s %s' % (self.host, self.port), encoding='utf-8', timeout=TELNET_TIMEOUT)
self.logger.info('setup_pexpect %s' % self.tn)
# logfile: <open file '<stdout>', mode 'w' at 0x7fe160149150>
# logfile_read: None
# logfile_send: None
Expand All @@ -117,16 +138,17 @@ def setup_pexpect(self):

@staticmethod
def telnet_callback(tn, command, option):
    """Reply to a telnet option-negotiation request.

    - DO TTYPE: answer WILL TTYPE, then send a TTYPE sub-negotiation
      carrying the terminal name "LogParser".
    - DO / DONT (any other option): answer WILL <option>.
    - WILL / WONT: answer DO <option>.

    :param tn: object exposing ``sendall()``; presumably the socket that
        telnetlib hands to an option-negotiation callback (registration is
        not visible here, confirm against the caller).
    :param command: negotiation command byte (DO, DONT, WILL or WONT).
    :param option: option byte the command refers to.
    """
    if command == telnetlib.DO and option == telnetlib.TTYPE:
        tn.sendall(telnetlib.IAC + telnetlib.WILL + telnetlib.TTYPE)
        # The payload must be bytes: telnetlib's constants are bytes on
        # Python 3, so mixing in str literals raises TypeError.
        tn.sendall(telnetlib.IAC + telnetlib.SB + telnetlib.TTYPE
                   + b'\0' + b'LogParser'
                   + telnetlib.IAC + telnetlib.SE)
    elif command in (telnetlib.DO, telnetlib.DONT):
        tn.sendall(telnetlib.IAC + telnetlib.WILL + option)
    elif command in (telnetlib.WILL, telnetlib.WONT):
        tn.sendall(telnetlib.IAC + telnetlib.DO + option)

def setup_telnet(self):
self.tn = Telnet(self.host, int(self.port), timeout=TELNET_TIMEOUT)
self.tn = telnetlib.Telnet(self.host, int(self.port), timeout=TELNET_TIMEOUT)
self.logger.info('setup_telnet %s' % self.tn)
# [twisted] CRITICAL: Unhandled Error
# Failure: twisted.conch.telnet.OptionRefused: twisted.conch.telnet.OptionRefused
# https://github.com/jookies/jasmin-web/issues/2
Expand All @@ -135,6 +157,7 @@ def setup_telnet(self):
self.tn.set_debuglevel(logging.DEBUG)

def parse_output(self, text):
self.logger.info('parse_output text: ###%s###' % text)
m = re.search(r'{.+}', text)
if m:
result = self.parse_crawler_stats(m.group())
Expand All @@ -148,12 +171,15 @@ def parse_output(self, text):
result[k] = True
elif v == 'False':
result[k] = False
elif v == 'None':
result[k] = None
else:
try:
result[k] = int(float(v))
except (TypeError, ValueError):
pass
if result:
self.logger.info('parse_output result: ###%s###' % result)
return self.get_ordered_dict(result, source='telnet')
else:
return {}
Expand All @@ -165,16 +191,20 @@ def bytes_to_str(src):
return src.decode('utf-8')
# TypeError: got <type 'str'> ('Username: ') as pattern,
# must be one of: <type 'unicode'>, pexpect.EOF, pexpect.TIMEOUT
self.tn.expect(u'Username: ', timeout=TELNET_TIMEOUT)
self.tn.sendline(self.telnet_username)
self.tn.expect(u'Password: ', timeout=TELNET_TIMEOUT)
self.tn.sendline(self.telnet_password)
self.tn.expect(u'>>>', timeout=TELNET_TIMEOUT)
try:
self.tn.expect(u'Username: ', timeout=TELNET_TIMEOUT)
self.tn.sendline(self.telnet_username)
self.tn.expect(u'Password: ', timeout=TELNET_TIMEOUT)
self.tn.sendline(self.telnet_password)
self.tn.expect(u'>>>', timeout=TELNET_TIMEOUT)
except Exception as err:
self.logger.warning("Found error in pexpect_io %s %s: %s" % (self.telnet_username, self.telnet_password, err))
raise err

self.tn.sendline(bytes_to_str(TELNETCONSOLE_COMMAND_MAP['log_file']))
self.tn.expect(re.compile(r'[\'"].+>>>', re.S), timeout=TELNET_TIMEOUT)
log_file = self.tn.after
self.logger.debug("settings['LOG_FILE'] found via telnet: %s", log_file)
self.logger.info("settings['LOG_FILE'] found via telnet: ###%s###" % log_file)
if not self.verify_log_file_path(self.parse_log_path(self.data['log_path']), log_file):
self.logger.warning("Skip telnet due to mismatching: %s AND %s", self.data['log_path'], log_file)
return
Expand Down
9 changes: 6 additions & 3 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,11 @@

classifiers=[
"License :: OSI Approved :: GNU General Public License v3 (GPLv3)",
"Programming Language :: Python :: 2.7",
"Programming Language :: Python :: 3.6",
"Programming Language :: Python :: 3.7"
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
"Programming Language :: Python :: 3.13",
]
)
Loading