From b702b125f3d67d57580976542a45d9d883a7e02e Mon Sep 17 00:00:00 2001
From: Dylan Katz
Date: Sun, 25 Oct 2020 23:44:43 -0700
Subject: [PATCH 1/5] Refactor paste site structure, added test

---
 pastehunter-cli                    |  39 ++++++++--
 pastehunter/inputs/base_input.py   |  52 +++++++++++++
 pastehunter/inputs/ixio.py         |   0
 pastehunter/inputs/pastebin.py     | 115 +++++++++++++++++++----------
 pastehunter/inputs/slexy.py        | 100 ++++++++++++-------------
 pastehunter/outputs/http_output.py |   3 +-
 test/test_paste_objects.py         |  76 +++++++++++++++++++
 7 files changed, 286 insertions(+), 99 deletions(-)
 create mode 100644 pastehunter/inputs/base_input.py
 create mode 100644 pastehunter/inputs/ixio.py
 create mode 100644 test/test_paste_objects.py

diff --git a/pastehunter-cli b/pastehunter-cli
index 29bf926..8dfbcd1 100644
--- a/pastehunter-cli
+++ b/pastehunter-cli
@@ -21,6 +21,11 @@ from pastehunter.common import parse_config
 
 VERSION = 1.0
 
+# Decided not to make this configurable as it currently really only applies to pastebin but may change in functionality later.
+# If someone would like this as a config key, please feel free to open an issue or a PR :)
+# TODO: @Plazmaz
+MAX_ITEM_RETRIES = 5
+
 # Setup Default logging
 root = logging.getLogger()
 ch = logging.StreamHandler()
@@ -50,16 +55,18 @@ class timeout:
     def __init__(self, seconds=1, error_message='Timeout'):
         self.seconds = seconds
         self.error_message = error_message
+
     def handle_timeout(self, signum, frame):
         raise TimeoutError("Process timeout: {0}".format(self.error_message))
+
     def __enter__(self):
         signal.signal(signal.SIGALRM, self.handle_timeout)
         signal.alarm(self.seconds)
+
     def __exit__(self, type, value, traceback):
         signal.alarm(0)
 
-
 # Set up the log file
 if "log" in conf and conf["log"]["log_to_file"]:
     if conf["log"]["log_path"] != "":
@@ -192,11 +199,31 @@ def paste_scanner(paste_data, rules_buff):
             if paste_site == 'slexy.org':
                 headers['User-Agent'] = 'PasteHunter'
 
-            req = requests.get(raw_paste_uri, headers=headers)
-            if req.status_code == 200:
-                raw_paste_data = req.text
-            else:
-                logger.error("Request returned unexpected response code {}: {}".format(req.status_code, req.text))
+            attempt_count = 0
+            while attempt_count < MAX_ITEM_RETRIES:
+                attempt_count += 1
+                req = requests.get(raw_paste_uri, headers=headers)
+                if req.status_code == 200:
+                    raw_paste_data = req.text
+                    if attempt_count > 1:
+                        logger.warning('Successfully resolved 429 exception')
+                    break
+
+                # We may want to handle other status codes in the future,
+                # for now 429 is the only code we retry for, just to avoid issues with
+                # rate limiting and hammering sites for 404s or outages
+                elif req.status_code == 429:
+                    logger.warning('Encountered unexpected 429 when requesting item at %s' +
+                                   ' for site "%s". Retrying (attempt %d)...', raw_paste_uri,
+                                   paste_site, attempt_count + 1)
+                    sleep(10)
+                else:
+                    logger.error("Request returned unexpected response code %d: %s", req.status_code,
+                                 req.text)
+
+            if req.status_code == 429:
+                logger.error("Unable to resolve 429 exception after %d retries, giving up on item %s.",
+                             MAX_ITEM_RETRIES, raw_paste_uri)
 
             # Cover fetch site SSLErrors
         except requests.exceptions.SSLError as e:
diff --git a/pastehunter/inputs/base_input.py b/pastehunter/inputs/base_input.py
new file mode 100644
index 0000000..68298ff
--- /dev/null
+++ b/pastehunter/inputs/base_input.py
@@ -0,0 +1,52 @@
+from abc import ABC, abstractmethod
+from typing import Any, Optional, Dict, List, Union
+
+import requests
+
+
+class BasePasteSite(ABC):
+    def make_request(self, url: str, timeout: Optional[int] = 10, headers: Optional[Dict[str, Any]] = None):
+        """
+        Make a request and return the results
+        :param url: The url to request
+        :param timeout: The timeout for the request
+        :param headers: The headers dict
+        :return:
+        """
+        req = requests.get(url, headers=headers, timeout=timeout)
+        return req
+
+    @abstractmethod
+    def remap_raw_item(self, raw_item: Union[str, Dict]) -> Dict[str, Any]:
+        """
+        Takes a raw item and remaps it to a normalized paste dict
+        :param raw_item:
+        :return: The paste dict
+        """
+        pass
+
+    @abstractmethod
+    def get_paste_for_id(self, paste_id: Any) -> str:
+        """
+        Returns a paste for the given paste_id
+        :param paste_id: The paste to retrieve
+        :return: A raw paste object
+        """
+        pass
+
+    @abstractmethod
+    def get_paste_id(self, paste_obj: Dict[str, Any]) -> Union[str, int]:
+        """
+        Returns an id for the given paste object
+        :param paste_obj: The raw paste dict
+        :return: The paste i
+        passd (str or int)
+        """
+
+    @abstractmethod
+    def get_recent_items(self, input_history: List[str]):
+        """
+        Gets recent items
+        :return: a list of recent items
+        """
+        pass
diff --git a/pastehunter/inputs/ixio.py b/pastehunter/inputs/ixio.py
new file mode 100644
index 0000000..e69de29
diff --git a/pastehunter/inputs/pastebin.py b/pastehunter/inputs/pastebin.py
index 23cb0a0..c25273e 100644
--- a/pastehunter/inputs/pastebin.py
+++ b/pastehunter/inputs/pastebin.py
@@ -1,49 +1,88 @@
+from typing import Any, Dict, Union, Optional
+
 import requests
 import logging
 from datetime import datetime
 
+from inputs.base_input import BasePasteSite
+
 logger = logging.getLogger('pastehunter')
 
-def recent_pastes(conf, input_history):
-    # populate vars from config
-    paste_limit = conf['inputs']['pastebin']['paste_limit']
-    api_scrape = conf['inputs']['pastebin']['api_scrape']
-    history = []
-    paste_list = []
-    try:
-        # Create the API uri
-        scrape_uri = '{0}?limit={1}'.format(api_scrape, paste_limit)
-        # Get some pastes and convert to json
-        # Get last 'paste_limit' pastes
-        paste_list_request = requests.get(scrape_uri)
-
-        # Check to see if our IP is whitelisted or not.
+ +class PastebinPasteSite(BasePasteSite): + + def __init__(self, conf): + self.conf = conf + + def remap_raw_item(self, raw_item: Dict) -> Dict[str, Any]: + # Create a new paste dict for us to normalize + pid = self.get_paste_id(raw_item) + paste_data = raw_item + paste_data['filename'] = pid + paste_data['confname'] = 'pastebin' + paste_data['pasteid'] = pid + paste_data['pastesite'] = 'pastebin.com' + # Add a date field that kibana will map + date = datetime.utcfromtimestamp(float(paste_data['date'])).isoformat() + paste_data['@timestamp'] = date + return paste_data + + def make_request(self, url: str, timeout: Optional[int] = 10, headers: Optional[Dict[str, Any]] = None): + paste_list_request = super(PastebinPasteSite, self).make_request(url, timeout, headers) + + # Check to see if our IP is whitelisted or not. if 'DOES NOT HAVE ACCESS' in paste_list_request.text: logger.error("Your IP is not whitelisted visits 'https://pastebin.com/doc_scraping_api'") - return [], [] - paste_list_json = paste_list_request.json() - - for paste in paste_list_json: - # Track paste ids to prevent dupes - history.append(paste['key']) - if paste['key'] in input_history: - continue - - # Create a new paste dict for us to normalize - paste_data = paste - paste_data['filename'] = paste['key'] - paste_data['confname'] = 'pastebin' - paste_data['pasteid'] = paste['key'] - paste_data['pastesite'] = 'pastebin.com' - # Add a date field that kibana will map - date = datetime.utcfromtimestamp(float(paste_data['date'])).isoformat() - paste_data['@timestamp'] = date - paste_list.append(paste_data) - return paste_list, history - - except Exception as e: - logger.error("Unable to parse paste results: {0}".format(e)) - return paste_list, history + return None + return paste_list_request + + def get_paste_for_id(self, paste_id: Any) -> str: + pass + + def get_paste_id(self, paste_obj: Dict[str, Any]) -> Union[str, int]: + return paste_obj['key'] + + def get_recent_items(self, input_history): + paste_limit = self.conf['inputs']['pastebin']['paste_limit'] + api_scrape = self.conf['inputs']['pastebin']['api_scrape'] + + history = [] + paste_list = [] + try: + # Create the API uri + scrape_uri = '{0}?limit={1}'.format(api_scrape, paste_limit) + # Get some pastes and convert to json + # Get last 'paste_limit' pastes + + paste_list_request = self.make_request(scrape_uri) + + # IP not whitelisted + if not paste_list_request: + return [], [] + + paste_list_json = paste_list_request.json() + + for paste in paste_list_json: + pid = self.get_paste_id(paste) + # Track paste ids to prevent dupes + history.append(pid) + if pid in input_history: + continue + + paste_data = self.remap_raw_item(paste) + paste_list.append(paste_data) + + return paste_list, history + + except Exception as e: + logger.error("Unable to parse paste results: {0}".format(e)) + return paste_list, history + + +def recent_pastes(conf, input_history): + site = PastebinPasteSite(conf) + # populate vars from config + return site.get_recent_items(input_history) diff --git a/pastehunter/inputs/slexy.py b/pastehunter/inputs/slexy.py index e762fca..af3013f 100644 --- a/pastehunter/inputs/slexy.py +++ b/pastehunter/inputs/slexy.py @@ -2,33 +2,29 @@ import re from datetime import datetime from time import sleep +from typing import Any, Dict, Optional, List, Union -import requests +from inputs.base_input import BasePasteSite logger = logging.getLogger('pastehunter') -class SlexySite(object): +class SlexyPasteSite(BasePasteSite): def __init__(self): + self.url = None self.site 
= "slexy.org" url_slexy = "https://" + self.site self.url_recent = url_slexy + "/recent" self.url_view = url_slexy + "/view" self.url_raw = url_slexy + "/raw" - self.url = None - - def request_view_link(self, pid): - return self._make_request("%s/%s" % (self.url_view, pid)) - - def raw_link(self, pid, args): - return "%s/%s%s" % (self.url_raw, pid, args) - def _make_request(self, url): - req = requests.get(url, headers={ + def make_request(self, url: str, timeout: Optional[int] = 10, headers: Optional[Dict[str, Any]] = None): + req = super(SlexyPasteSite, self).make_request(url, timeout, { 'Referer': self.url_recent, 'User-Agent': 'PasteHunter' - }, timeout=10) + }) + ratelimit_limit = int(req.headers.get('RateLimit-Limit', 30)) remaining = int(req.headers.get('RateLimit-Remaining', 30)) logger.debug('Remaining Slexy Ratelimit: {0}'.format(remaining)) @@ -36,59 +32,55 @@ def _make_request(self, url): if req.status_code == 429: timeout = req.headers.get('Retry-After', 60) sleep(timeout) - return self._make_request(url) + return self.make_request(url, timeout) # If ratelimit_limit = 60, 60/60 = 1 # If ratelimit_limit = 30, 60/30 = 2 sleep(30 / ratelimit_limit) return req.text - -class SlexyPaste(SlexySite): - def __init__(self, pid): - super(SlexyPaste, self).__init__() - self.pid = pid - self.site = self.site - self.timestamp = None - self.parse() - - def parse(self): - data = self.request_view_link(self.pid) - self.timestamp = self.get_timestamp(data) - self.url = self.get_raw_link(data) - - def get_raw_link(self, data): - pattern = ' Union[str, int]: + return paste_obj.get('pasteid') + + def remap_raw_item(self, raw_item: [str, Dict]) -> Dict[str, Any]: + timestamp = self.get_timestamp(raw_item) + paste_id = self.get_paste_id(raw_item) + raw_url = self.get_raw_link(raw_item, paste_id) + self.get_paste_id(raw_item) + return { + 'confname': 'slexy', + 'scrape_url': raw_url, + 'pasteid': paste_id, + 'pastesite': self.site, + '@timestamp': timestamp + } + + def get_raw_data(self, raw_url): + return self.make_request(raw_url) + + def get_paste_for_id(self, paste_id: Any) -> str: + return self.make_request("%s/%s" % (self.url_view, paste_id)) + + def get_raw_link(self, data, pid): + pattern = '', getdata) + def get_recent_items(self, input_history: List[str]): + data = self.make_request(self.url_recent) + pids = re.findall('', data) return list(set(pids)) def recent_pastes(conf, input_history): history = [] paste_list = [] - my_scraper = SlexyScraper() - recent_pids = my_scraper.get_recents() + my_scraper = SlexyPasteSite() + recent_pids = my_scraper.get_recent_items(input_history) pid_to_process = set() for pid in recent_pids: if pid in input_history: @@ -97,14 +89,14 @@ def recent_pastes(conf, input_history): pid_to_process.add(pid) try: for pid in pid_to_process: - paste = SlexyPaste(pid) - history.append(paste.pid) + paste_data = my_scraper.get_paste_for_id(pid) + raw = my_scraper.get_raw_link(paste_data, pid) paste_data = { 'confname': 'slexy', - 'scrape_url': paste.url, - 'pasteid': paste.pid, - 'pastesite': paste.site, - '@timestamp': paste.timestamp + 'scrape_url': raw, + 'pasteid': pid, + 'pastesite': my_scraper.site, + '@timestamp': my_scraper.get_timestamp(paste_data) } paste_list.append(paste_data) return paste_list, history diff --git a/pastehunter/outputs/http_output.py b/pastehunter/outputs/http_output.py index 430acf4..f90fc5f 100644 --- a/pastehunter/outputs/http_output.py +++ b/pastehunter/outputs/http_output.py @@ -1,6 +1,7 @@ import logging + import requests -import json + 
from pastehunter.common import parse_config logger = logging.getLogger('pastehunter') diff --git a/test/test_paste_objects.py b/test/test_paste_objects.py new file mode 100644 index 0000000..1aab169 --- /dev/null +++ b/test/test_paste_objects.py @@ -0,0 +1,76 @@ +from inputs import slexy +from inputs.pastebin import PastebinPasteSite +from inputs.slexy import SlexyPasteSite + + +pids_found = [] +def mock_get_paste_for_pid(pid): + pids_found.append(pid) + return "pid_is_" + pid + + +class FakeRequestJson(object): + def __init__(self, ret): + self.ret = ret + + def json(self): + return self.ret + + +def test_slexy_site(): + pid_list_fake = [0, 1, 2, 3, 4] + slexy_site = SlexyPasteSite() + slexy_site.get_recent_items = lambda: pid_list_fake + slexy_site.get_paste_for_id = lambda pid: mock_get_paste_for_pid(str(pid)) + slexy_site.remap_raw_item = lambda raw_data, pid: {"pid": 123} + recent_pids = slexy_site.get_recent_items() + assert recent_pids == pid_list_fake + for pid in recent_pids: + paste = slexy_site.get_paste_for_id(pid) + paste_data = slexy_site.remap_raw_item(paste, pid) + assert paste == 'pid_is_' + str(pid) + assert paste_data == {"pid": 123} + +def test_pastebin_site_remap(): + fake_conf = { + 'inputs': { + 'pastebin': { + 'paste_limit': 100, + 'api_scrape': 'https://scrape.pastebin.com/api_scraping.php' + } + } + } + data = { + 'key': 'a', + 'test': 'b', + 'date': '1582595793' + } + pastebin_site = PastebinPasteSite(fake_conf) + out = pastebin_site.remap_raw_item(data) + assert out == {'key': 'a', 'test': 'b', 'date': '1582595793', 'filename': 'a', 'confname': 'pastebin', + 'pasteid': 'a', 'pastesite': 'pastebin.com', '@timestamp': '2020-02-25T01:56:33'} + +def test_pastebin_site(): + fake_conf = { + 'inputs': { + 'pastebin': { + 'paste_limit': 100, + 'api_scrape': 'https://scrape.pastebin.com/api_scraping.php' + } + } + } + pastebin_site = PastebinPasteSite(fake_conf) + pastebin_site.make_request = lambda url: FakeRequestJson([ + { + 'key': 'ab', + 'date': '1582595793' + }, + { + 'key': 'bc', + 'date': '1582595793' + } + ]) + pastes, paste_ids = pastebin_site.get_recent_items([]) + assert paste_ids == ['ab', 'bc'] + assert pastes[0].get('key') == 'ab' + assert pastes[1].get('key') == 'bc' From 583c2078c7ee0b4694b2c0e5f396b78af6b13e95 Mon Sep 17 00:00:00 2001 From: Dylan Katz Date: Sat, 31 Oct 2020 00:07:38 -0700 Subject: [PATCH 2/5] Added support for ixio Resolves #95 Since this is a smallish site I've disabled the input by default. To the owner of ixio, if you'd like us to implement ratelimiting mechanisms please reach out! 
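
A note on the approach: ix.io paste IDs are sequential base62 values, so even
though the /user/ page only lists anonymous pastes, every ID between two
observed IDs can be inferred and fetched. A standalone sketch of that idea,
using the new helpers added to pastehunter/common.py in this patch
(fill_id_gaps is an illustrative name for this message, not a function in the
patch itself):

    from pastehunter.common import base62_decode, base62_encode

    def fill_id_gaps(encoded_ids):
        # Decode the observed IDs to ints, then enumerate the full range;
        # ix.io assigns IDs sequentially, so everything in between exists too.
        ids = sorted(base62_decode(e) for e in encoded_ids)
        return [base62_encode(i) for i in range(ids[0], ids[-1] + 1)]

    # fill_id_gaps(['2BZF', '2BZH']) == ['2BZF', '2BZG', '2BZH']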
---
 pastehunter/common.py            |  26 ++
 pastehunter/inputs/base_input.py |   2 +-
 pastehunter/inputs/ixio.py       | 114 ++++++
 settings.json.sample             |   9 +-
 test/test_base62.py              |  11 +
 test/test_ix.py                  | 607 +++++++++++++++++++++++++++++++
 6 files changed, 766 insertions(+), 3 deletions(-)
 create mode 100644 test/test_base62.py
 create mode 100644 test/test_ix.py

diff --git a/pastehunter/common.py b/pastehunter/common.py
index 54fd4a3..08023df 100644
--- a/pastehunter/common.py
+++ b/pastehunter/common.py
@@ -5,6 +5,10 @@
 logger = logging.getLogger('pastehunter')
 home = os.path.expanduser("~")
 
+BASE62_CHARS = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
+BASE_LOOKUP = dict((c, i) for i, c in enumerate(BASE62_CHARS))
+BASE_LEN = len(BASE62_CHARS)
+
 # Parse the config file in to a dict
 def parse_config():
     conf = None
@@ -26,3 +30,25 @@
         logger.error("Unable to read config file '~/.config/pastehunter.json'")
 
     return conf
+
+
+# Most of this was pulled from https://stackoverflow.com/a/2549514
+def base62_decode(input: str) -> int:
+    length = len(BASE_LOOKUP)
+    ret = 0
+    for i, c in enumerate(input[::-1]):
+        ret += (length ** i) * BASE_LOOKUP[c]
+
+    return ret
+
+
+def base62_encode(integer) -> str:
+    if integer == 0:
+        return BASE62_CHARS[0]
+
+    ret = ''
+    while integer != 0:
+        ret = BASE62_CHARS[integer % BASE_LEN] + ret
+        integer //= BASE_LEN
+
+    return ret
diff --git a/pastehunter/inputs/base_input.py b/pastehunter/inputs/base_input.py
index 68298ff..f224223 100644
--- a/pastehunter/inputs/base_input.py
+++ b/pastehunter/inputs/base_input.py
@@ -39,7 +39,7 @@ def get_paste_id(self, paste_obj: Dict[str, Any]) -> Union[str, int]:
         """
         Returns an id for the given paste object
         :param paste_obj: The raw paste dict
-        :return: The paste i
+        :return: The paste id
         passd (str or int)
         """
 
diff --git a/pastehunter/inputs/ixio.py b/pastehunter/inputs/ixio.py
index e69de29..bb0a191 100644
--- a/pastehunter/inputs/ixio.py
+++ b/pastehunter/inputs/ixio.py
@@ -0,0 +1,114 @@
+import logging
+import re
+from datetime import datetime
+from time import sleep
+from typing import List, Any, Dict, Union
+
+import requests
+
+from common import base62_decode, base62_encode
+from inputs.base_input import BasePasteSite
+
+logger = logging.getLogger('pastehunter')
+
+
+class IxDotIoSite(BasePasteSite):
+    # Yeah, yeah, I know, no regex for HTML parsing...
+    # If we end up doing a lot more of this, then maybe we'll use beautifulsoup or something.
+    # Capturing groups:
+    # 1. Paste ID
+    # 2. Timestamp
+    _ITEM_ID_RE: re.Pattern = re.compile('<a href="/([a-zA-Z0-9]+)">[\\sa-zA-Z0-9]+'
+                                         '\\[r][^\r\n]+'
+                                         '\\s+@ (.*?)[\r\n]')
+
+    def __init__(self, conf):
+        self.conf = conf
+        self.site = "ix.io"
+        url_main = "http://" + self.site
+        self.url_recent = url_main + "/user/"
+        self.view_pattern = url_main + "/{}/"
+        self.raw_pattern = url_main + "/{}"
+        self.url = None
+
+    def remap_raw_item(self, raw_item: Union[str, Dict]) -> Dict[str, Any]:
+        pid = raw_item['pid']
+        paste_data = {
+            'filename': str(pid),
+            'confname': 'ixio',
+            'pastesite': self.site,
+            'pasteid': pid,
+        }
+        # Timezone is UTC/Zulu
+        date = datetime.strptime(raw_item['date'], '%a %b %d %H:%M:%S %Y').isoformat()
+        paste_data['@timestamp'] = date
+        encoded_pid = self.get_paste_id(paste_data)
+        paste_data['scrape_url'] = self.raw_pattern.format(encoded_pid)
+        return paste_data
+
+    def get_paste_for_id(self, paste_id: Any) -> str:
+        return self.make_request(self.raw_pattern.format(paste_id)).text
+
+    def get_paste_id(self, paste_obj: Dict[str, Any]) -> str:
+        decoded = paste_obj.get('pasteid')
+        return base62_encode(decoded)
+
+    def get_recent_items(self, input_history: List[str]):
+        history = []
+        paste_list = []
+        try:
+            recent_page = self.make_request(self.url_recent)
+            item_data = self.get_data_for_page(recent_page.text)
+
+            for val in item_data:
+                # Track paste ids to prevent dupes
+                pid = val['pid']
+                history.append(pid)
+                if pid in input_history:
+                    continue
+                paste_data = self.remap_raw_item(val)
+                paste_list.append(paste_data)
+
+            return paste_list, history
+
+        except Exception as e:
+            logger.error("Unable to parse ixio items: {0}".format(e))
+            return paste_list, history
+
+    def get_data_for_page(self, page_data: str) -> List[Dict[str, Union[int, str]]]:
+        page: List[Dict[str, Union[int, str]]] = []
+        last_item_id = -1
+        regex_matches = self._ITEM_ID_RE.findall(page_data)
+        # We are going to reverse the order because ix pages are structured newest -> oldest, and this makes it simpler.
+        regex_matches.reverse()
+        for encoded_id, created_at in regex_matches:
+            # Okay so the logic here is a bit tricky. Basically, ix's all user page only returns anonymous pastes
+            # BUT! We can infer the paste ids that aren't present by filling in the blanks, because ix IDs are
+            # incremental. So first, we base62 decode the value so we can use it as an int
+            item_id = base62_decode(encoded_id)
+            # Then, we check if we've seen another value. If this is our first, we can skip a lot of this logic.
+            # (we probably don't want to go back and grab every ix paste historically for most use cases)
+            if last_item_id == -1:
+                page.append({'pid': item_id, 'date': created_at})
+                last_item_id = item_id
+            # If there has been a delta, let's traverse it.
+            elif item_id - last_item_id > 1:
+                # We've already hit last_item_id so we skip that and fill in the delta
+                for i in range(last_item_id + 1, item_id + 1):
+                    # Copy the created date as a best guess
+                    page.append({'pid': i, 'date': created_at})
+                last_item_id = item_id
+            else:
+                # If there's no delta, just add this normally
+                page.append({'pid': item_id, 'date': created_at})
+                last_item_id = item_id
+        return page
+
+
+def recent_pastes(conf, input_history):
+    site = IxDotIoSite(conf)
+
+    # populate vars from config
+    return site.get_recent_items(input_history)
diff --git a/settings.json.sample b/settings.json.sample
index 5a032d8..4a43f5d 100644
--- a/settings.json.sample
+++ b/settings.json.sample
@@ -5,7 +5,12 @@
       "module": "pastehunter.inputs.pastebin",
       "api_scrape": "https://scrape.pastebin.com/api_scraping.php",
       "api_raw": "https://scrape.pastebin.com/api_scrape_item.php?i=",
-      "paste_limit": 200,
+      "paste_limit": 100,
+      "store_all": false
+    },
+    "ixio": {
+      "enabled": false,
+      "module": "pastehunter.inputs.ixio",
       "store_all": false
     },
     "dumpz": {
@@ -21,7 +26,7 @@
       "enabled": true,
       "module": "pastehunter.inputs.gists",
       "api_token": "",
-      "api_limit": 100,
+      "api_limit": 200,
       "store_all": false,
       "user_blacklist": [],
       "file_blacklist": ["grahamcofborg-eval-package-list", "Changed Paths"]
diff --git a/test/test_base62.py b/test/test_base62.py
new file mode 100644
index 0000000..241cae2
--- /dev/null
+++ b/test/test_base62.py
@@ -0,0 +1,11 @@
+from common import base62_decode, base62_encode
+
+def test_b62_encode():
+    assert base62_encode(622708) == '2BZG'
+    assert base62_encode(622707) == '2BZF'
+
+def test_b62_decode():
+    assert base62_decode('1') == 1
+    assert base62_decode('a') == 10
+    assert base62_decode('2BZF') == 622707
+    assert base62_decode('2BZG') == 622708
\ No newline at end of file
diff --git a/test/test_ix.py b/test/test_ix.py
new file mode 100644
index 0000000..8892602
--- /dev/null
+++ b/test/test_ix.py
@@ -0,0 +1,607 @@
+from inputs.ixio import IxDotIoSite
+test_data = '''
+
+
+ +
+2CmF +[r] [h] +@ Thu Oct 29 07:00:14 2020 +
+ +
+2CmE +[r] [h] +@ Thu Oct 29 07:00:13 2020 +
+ +
+2CmD +[r] [h] +@ Thu Oct 29 06:47:22 2020 +
+ +
+2CmC +[r] [h] +@ Thu Oct 29 06:26:48 2020 +
+ +
+2CmB +[r] [h] +@ Thu Oct 29 06:21:48 2020 +
+ +
+2CmA +[r] [h] +@ Thu Oct 29 06:19:33 2020 +
+ +
+2Cmz +[r] [h] +@ Thu Oct 29 06:08:17 2020 +
+ +
+2Cmy +[r] [h] +@ Thu Oct 29 06:00:16 2020 +
+ +
+2Cmx +[r] [h] +@ Thu Oct 29 06:00:14 2020 +
+ +
+2Cmw +[r] [h] +@ Thu Oct 29 05:03:04 2020 +
+ +
+2Cmv +[r] [h] +@ Thu Oct 29 05:03:02 2020 +
+ +
+2Cmu +[r] [h] +@ Thu Oct 29 05:00:09 2020 +
+ +
+2Cmt +[r] [h] +@ Thu Oct 29 04:58:15 2020 +
+ +
+2Cms +[r] [h] +@ Thu Oct 29 04:57:39 2020 +
+ +
+2Cmr +[r] [h] +@ Thu Oct 29 04:57:31 2020 +
+ +
+2Cmq +[r] [h] +@ Thu Oct 29 04:57:24 2020 +
+ +
+2Cmp +[r] [h] +@ Thu Oct 29 04:51:45 2020 +
+ +
+2Cmo +[r] [h] +@ Thu Oct 29 04:10:10 2020 +
+ +
+2Cmn +[r] [h] +@ Thu Oct 29 04:09:34 2020 +
+ +
+2Cmm +[r] [h] +@ Thu Oct 29 04:02:17 2020 +
+ +
+2Cml +[r] [h] +@ Thu Oct 29 04:00:14 2020 +
+ +
+2Cmk +[r] [h] +@ Thu Oct 29 04:00:04 2020 +
+ +
+2Cmj +[r] [h] +@ Thu Oct 29 03:58:55 2020 +
+ +
+2Cmi +[r] [h] +@ Thu Oct 29 03:57:40 2020 +
+ +
+2Cmh +[r] [h] +@ Thu Oct 29 03:50:57 2020 +
+ +
+2Cmg +[r] [h] +@ Thu Oct 29 03:42:28 2020 +
+ +
+2Cmf +[r] [h] +@ Thu Oct 29 03:40:56 2020 +
+ +
+2Cme +[r] [h] +@ Thu Oct 29 03:27:14 2020 +
+ +
+2Cmd +[r] [h] +@ Thu Oct 29 03:26:44 2020 +
+ +
+2Cmc +[r] [h] +@ Thu Oct 29 03:26:29 2020 +
+ +
+2Cmb +[r] [h] +@ Thu Oct 29 03:22:12 2020 +
+ +
+2Cma +[r] [h] +@ Thu Oct 29 03:19:14 2020 +
+ +
+2Cm9 +[r] [h] +@ Thu Oct 29 03:19:00 2020 +
+ +
+2Cm8 +[r] [h] +@ Thu Oct 29 03:18:46 2020 +
+ +
+2Cm7 +[r] [h] +@ Thu Oct 29 03:18:05 2020 +
+ +
+2Cm6 +[r] [h] +@ Thu Oct 29 03:00:16 2020 +
+ +
+2Cm5 +[r] [h] +@ Thu Oct 29 02:59:56 2020 +
+ +
+2Cm4 +[r] [h] +@ Thu Oct 29 02:54:27 2020 +
+ +
+2Cm3 +[r] [h] +@ Thu Oct 29 02:30:04 2020 +
+ +
+2Cm1 +[r] [h] +@ Thu Oct 29 02:09:03 2020 +
+ +
+2Cm0 +[r] [h] +@ Thu Oct 29 02:04:08 2020 +
+ +
+2ClZ +[r] [h] +@ Thu Oct 29 02:02:27 2020 +
+ +
+2ClY +[r] [h] +@ Thu Oct 29 02:00:14 2020 +
+ +
+2ClX +[r] [h] +@ Thu Oct 29 02:00:13 2020 +
+ +
+2ClW +[r] [h] +@ Thu Oct 29 02:00:08 2020 +
+ +
+2ClV +[r] [h] +@ Thu Oct 29 01:56:47 2020 +
+ +
+2ClU +[r] [h] +@ Thu Oct 29 01:41:09 2020 +
+ +
+2ClS +[r] [h] +@ Thu Oct 29 01:30:02 2020 +
+ +
+2ClR +[r] [h] +@ Thu Oct 29 01:19:24 2020 +
+ +
+2ClQ +[r] [h] +@ Thu Oct 29 01:17:03 2020 +
+ +
+2ClP +[r] [h] +@ Thu Oct 29 01:00:13 2020 +
+ +
+2ClO +[r] [h] +@ Thu Oct 29 01:00:09 2020 +
+ +
+2ClN +[r] [h] +@ Thu Oct 29 00:46:53 2020 +
+ +
+2ClM +[r] [h] +@ Thu Oct 29 00:42:01 2020 +
+ +
+2ClL +[r] [h] +@ Thu Oct 29 00:27:03 2020 +
+ +
+2ClK +[r] [h] +@ Thu Oct 29 00:26:44 2020 +
+ +
+2ClJ +[r] [h] +@ Thu Oct 29 00:26:25 2020 +
+ +
+2ClI +[r] [h] +@ Thu Oct 29 00:26:05 2020 +
+ +
+2ClH +[r] [h] +@ Thu Oct 29 00:16:21 2020 +
+ +
+2ClG +[r] [h] +@ Thu Oct 29 00:16:07 2020 +
+ +
+2ClF +[r] [h] +@ Thu Oct 29 00:00:14 2020 +
+ +
+2ClE +[r] [h] +@ Thu Oct 29 00:00:07 2020 +
+ +
+2ClD +[r] [h] +@ Wed Oct 28 23:56:36 2020 +
+ +
+2ClC +[r] [h] +@ Wed Oct 28 23:54:07 2020 +
+2ClB +[r] [h] +@ Wed Oct 28 23:53:07 2020 + +2ClA +[r] [h] +@ Wed Oct 28 23:51:55 2020 + + +
+2Clz +[r] [h] +@ Wed Oct 28 23:50:24 2020 +
+ +
+2Cly +[r] [h] +@ Wed Oct 28 23:44:58 2020 +
+ +
+2Clx +[r] [h] +@ Wed Oct 28 23:40:54 2020 +
+ +
+2Clw +[r] [h] +@ Wed Oct 28 23:40:13 2020 +
+ +
+2Clv +[r] [h] +@ Wed Oct 28 23:38:37 2020 +
+ +
+2Clu +[r] [h] +@ Wed Oct 28 23:37:22 2020 +
+ +
+2Clt +[r] [h] +@ Wed Oct 28 23:31:22 2020 +
+ +
+2Cls +[r] [h] +@ Wed Oct 28 23:30:27 2020 +
+ +
+2Clr +[r] [h] +@ Wed Oct 28 23:25:57 2020 +
+ +
+2Clq +[r] [h] +@ Wed Oct 28 23:25:24 2020 +
+ +
+2Clo +[r] [h] +@ Wed Oct 28 23:07:09 2020 +
+ +
+2Cln +[r] [h] +@ Wed Oct 28 23:05:48 2020 +
+ +
+2Clm +[r] [h] +@ Wed Oct 28 23:02:16 2020 +
+ +
+2Cll +[r] [h] +@ Wed Oct 28 23:00:14 2020 +
+ +
+2Clk +[r] [h] +@ Wed Oct 28 23:00:07 2020 +
+ +
+2Clj +[r] [h] +@ Wed Oct 28 22:35:28 2020 +
+ +
+2Cli +[r] [h] +@ Wed Oct 28 22:32:50 2020 +
+ +
+2Clh +[r] [h] +@ Wed Oct 28 22:27:14 2020 +
+ +
+2Clg +[r] [h] +@ Wed Oct 28 22:16:44 2020 +
+ +
+2Clf +[r] [h] +@ Wed Oct 28 22:15:30 2020 +
+ +
+2Cle +[r] [h] +@ Wed Oct 28 22:14:18 2020 +
+ +
+2Cld +[r] [h] +@ Wed Oct 28 22:13:33 2020 +
+ +
+2Clc +[r] [h] +@ Wed Oct 28 22:11:11 2020 +
+ +
+2Clb +[r] [h] +issue #15767 @ Wed Oct 28 22:09:53 2020 +
+ +
+2Cla +[r] [h] +@ Wed Oct 28 22:08:25 2020 +
+ +
+2Cl9 +[r] [h] +@ Wed Oct 28 22:04:26 2020 +
+ +
+2Cl7 +[r] [h] +@ Wed Oct 28 22:00:23 2020 +
+ +
+2Cl6 +[r] [h] +@ Wed Oct 28 22:00:13 2020 +
+ +
+2Cl5 +[r] [h] +@ Wed Oct 28 22:00:09 2020 +
+ +
+2Cl4 +[r] [h] +@ Wed Oct 28 21:59:27 2020 +
+ +
+2Cl3 +[r] [h] +0001-DTS-sun8i-h2-plus-orangepi-zero-added-audio-codec.patch @ Wed Oct 28 21:58:51 2020 +
+ +
+2Cl2 +[r] [h] +@ Wed Oct 28 21:58:17 2020 +
+ +
+2Cl1 +[r] [h] +@ Wed Oct 28 21:56:42 2020 +
+ +
+ +''' +def test_page_items(): + site = IxDotIoSite(None) + ids = [x['pid'] for x in site.get_data_for_page(test_data)] + assert ids == [i for i in range(624031, 624134)] + +def \ No newline at end of file From 8975c6144da1049d47567293cf0ebdb081d6ed79 Mon Sep 17 00:00:00 2001 From: Dylan Katz Date: Sat, 31 Oct 2020 00:37:45 -0700 Subject: [PATCH 3/5] Whoops, fixed syntax error in test --- test/test_ix.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/test_ix.py b/test/test_ix.py index 8892602..a82f33d 100644 --- a/test/test_ix.py +++ b/test/test_ix.py @@ -599,9 +599,9 @@
''' + + def test_page_items(): site = IxDotIoSite(None) ids = [x['pid'] for x in site.get_data_for_page(test_data)] assert ids == [i for i in range(624031, 624134)] - -def \ No newline at end of file From 1d21e024364ac07200c33b2fec3ada7d54fef894 Mon Sep 17 00:00:00 2001 From: Dylan Katz Date: Sat, 14 Nov 2020 22:21:02 -0800 Subject: [PATCH 4/5] A few linting and docs tweaks --- .travis.yml | 3 ++- README.md | 4 ---- conftest.py | 2 -- docs/inputs.rst | 6 ++++++ pastehunter/inputs/ixio.py | 7 ++----- test/test_base62.py | 4 +++- test/test_paste_objects.py | 4 ++-- 7 files changed, 15 insertions(+), 15 deletions(-) delete mode 100644 conftest.py diff --git a/.travis.yml b/.travis.yml index 00ef9dc..afe4bdc 100644 --- a/.travis.yml +++ b/.travis.yml @@ -24,8 +24,9 @@ install: - pip install pytest - pip install -e . script: +- wget https://raw.githubusercontent.com/kevthehermit/PasteHunter/master/settings.json.sample -O ~/.config/pastehunter.json - pastehunter-cli -- pytest +- python -m pytest after_success: - python setup.py sdist deploy: diff --git a/README.md b/README.md index 9198c03..7904126 100644 --- a/README.md +++ b/README.md @@ -3,10 +3,6 @@ PasteHunter is a python3 application that is designed to query a collection of s For all the pastes it finds it scans the raw contents against a series of Yara rules looking for information that can be used by an organisation or a researcher. -## Pastebin API Deprecated - -We are aware that the pastebin scraping API has been deprectated and are reviewing. - ## Setup For setup instructions please see the official documentation https://pastehunter.readthedocs.io/en/latest/installation.html diff --git a/conftest.py b/conftest.py deleted file mode 100644 index c1ef2bd..0000000 --- a/conftest.py +++ /dev/null @@ -1,2 +0,0 @@ -# This is needed for some weird pytest hackery -# See https://stackoverflow.com/a/50610630/1742813 \ No newline at end of file diff --git a/docs/inputs.rst b/docs/inputs.rst index 7e84ff7..7384394 100644 --- a/docs/inputs.rst +++ b/docs/inputs.rst @@ -63,6 +63,12 @@ Slexy has some heavy rate limits (30 requests per 30 seconds), but may still ret - **api_raw**: The URL endpoint for the raw paste. - **api_view**: The URL enpoint to view the paste. +ix.io +--------- + +ix.io is a smaller site used primarily for console/command line pastes. + +- **store_all**: Store all pastes regardless of a rule match. StackExchange ------------- diff --git a/pastehunter/inputs/ixio.py b/pastehunter/inputs/ixio.py index bb0a191..9f220fd 100644 --- a/pastehunter/inputs/ixio.py +++ b/pastehunter/inputs/ixio.py @@ -1,10 +1,7 @@ import logging import re from datetime import datetime -from time import sleep -from typing import List, Any, Dict, Union - -import requests +from typing import List, Any, Dict, Union, Pattern from common import base62_decode, base62_encode from inputs.base_input import BasePasteSite @@ -18,7 +15,7 @@ class IxDotIoSite(BasePasteSite): # Capturing groups: # 1. Paste ID # 2. Timestamp - _ITEM_ID_RE: re.Pattern = re.compile('
<a href="/([a-zA-Z0-9]+)">[\\sa-zA-Z0-9]+'
-                                         '\\[r][^\r\n]+'
-                                         '\\s+@ (.*?)[\r\n]')
+    _ITEM_ID_RE: Pattern = re.compile('<a href="/([a-zA-Z0-9]+)">
[\\sa-zA-Z0-9]+' '\\[r][^\r\n]+' '\\s+@ (.*?)[\r\n]') diff --git a/test/test_base62.py b/test/test_base62.py index 241cae2..9d29df0 100644 --- a/test/test_base62.py +++ b/test/test_base62.py @@ -1,9 +1,11 @@ -from common import base62_decode, base62_encode +from pastehunter.common import base62_decode, base62_encode + def test_b62_encode(): assert base62_encode(622708) == '2BZG' assert base62_encode(622707) == '2BZF' + def test_b62_decode(): assert base62_decode('1') == 1 assert base62_decode('a') == 10 diff --git a/test/test_paste_objects.py b/test/test_paste_objects.py index 1aab169..ce85bed 100644 --- a/test/test_paste_objects.py +++ b/test/test_paste_objects.py @@ -1,9 +1,9 @@ -from inputs import slexy from inputs.pastebin import PastebinPasteSite from inputs.slexy import SlexyPasteSite - pids_found = [] + + def mock_get_paste_for_pid(pid): pids_found.append(pid) return "pid_is_" + pid From 938e1367636723e1a60df0ea035879fbaed34bd1 Mon Sep 17 00:00:00 2001 From: Dylan Katz Date: Sun, 22 Nov 2020 22:42:25 -0800 Subject: [PATCH 5/5] Updated changelog, tweaked a few formats and other values --- .travis.yml | 1 - CHANGELOG.md | 10 ++++++++++ __init__.py | 0 pastehunter-cli | 34 ++++++++++++++++++---------------- pytest.ini | 3 ++- settings.json.sample | 2 +- setup.py | 2 +- test/test_paste_objects.py | 8 +++++--- 8 files changed, 37 insertions(+), 23 deletions(-) create mode 100644 __init__.py diff --git a/.travis.yml b/.travis.yml index afe4bdc..e061b28 100644 --- a/.travis.yml +++ b/.travis.yml @@ -24,7 +24,6 @@ install: - pip install pytest - pip install -e . script: -- wget https://raw.githubusercontent.com/kevthehermit/PasteHunter/master/settings.json.sample -O ~/.config/pastehunter.json - pastehunter-cli - python -m pytest after_success: diff --git a/CHANGELOG.md b/CHANGELOG.md index f7197a8..354fbf5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,16 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [1.4.0] - 2020-11-22 +## Changed +- Added some error state checks and retry logic to pastebin scraping (#116) +- Refactored paste inputs to use a base class + +## Added +- Support for ix.io (#95) +- Additional unit tests (pytest still has some issues with import paths on travis) + + ## [1.3.2] - 2020-02-15 ### Changed Minor patch fixing error in email yara regexp diff --git a/__init__.py b/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/pastehunter-cli b/pastehunter-cli index 8dfbcd1..8587179 100644 --- a/pastehunter-cli +++ b/pastehunter-cli @@ -19,7 +19,7 @@ import yara import pastehunter from pastehunter.common import parse_config -VERSION = 1.0 +VERSION = '1.4.0' # Decided not to make this configurable as it currently really only applies to pastebin but may change in functionality later. 
# If someone would like this as a config key, please feel free to open an issue or a PR :) @@ -38,7 +38,7 @@ logger = logging.getLogger('pastehunter') logger.setLevel(logging.INFO) # Version info -logger.info("Starting PasteHunter Version: {0}".format(VERSION)) +logger.info("Starting PasteHunter Version: {}".format(VERSION)) # Parse the config file logger.info("Reading Configs") @@ -48,9 +48,11 @@ conf = parse_config() if not conf: sys.exit() + class TimeoutError(Exception): pass + class timeout: def __init__(self, seconds=1, error_message='Timeout'): self.seconds = seconds @@ -72,17 +74,19 @@ if "log" in conf and conf["log"]["log_to_file"]: if conf["log"]["log_path"] != "": logfile = "{0}/{1}.log".format(conf["log"]["log_path"], conf["log"]["log_file"]) # Assure directory exists - try: os.makedirs(conf["log"]["log_path"], exist_ok=True) # Python>3.2 + try: + os.makedirs(conf["log"]["log_path"], exist_ok=True) # Python>3.2 except TypeError: try: os.makedirs(conf["log"]["log_path"]) - except OSError as exc: # Python >2.5 + except OSError as exc: # Python >2.5 if exc.errno == errno.EEXIST and os.path.isdir(conf["log"]["log_path"]): pass - else: logger.error("Can not create log file {0}: {1}".format(conf["log"]["log_path"], exc)) + else: + logger.error("Can not create log file {0}: {1}".format(conf["log"]["log_path"], exc)) else: logfile = "{0}.log".format(conf["log"]["log_file"]) - fileHandler = handlers.RotatingFileHandler(logfile, mode='a+', maxBytes=(1048576*5), backupCount=7) + fileHandler = handlers.RotatingFileHandler(logfile, mode='a+', maxBytes=(1048576 * 5), backupCount=7) if conf["log"]["format"] != "": fileFormatter = logging.Formatter("{0}".format(conf["log"]["format"])) fileHandler.setFormatter(fileFormatter) @@ -118,7 +122,6 @@ for input_type, input_values in conf["inputs"].items(): input_list.append(input_values["module"]) logger.info("Enabled Input: {0}".format(input_type)) - # Configure Outputs logger.info("Configure Outputs") outputs = [] @@ -296,7 +299,6 @@ def paste_scanner(paste_data, rules_buff): # remove the confname key as its not really needed past this point del paste_data['confname'] - # Blacklist Check # If any of the blacklist rules appear then empty the result set blacklisted = False @@ -310,7 +312,6 @@ def paste_scanner(paste_data, rules_buff): return True return False - # Post Process # If post module is enabled and the paste has a matching rule. @@ -322,14 +323,13 @@ def paste_scanner(paste_data, rules_buff): logger.info("Running Post Module {0} on {1}".format(post_values["module"], paste_data["pasteid"])) post_module = importlib.import_module(post_values["module"]) post_results = post_module.run(results, - raw_paste_data, - paste_data - ) + raw_paste_data, + paste_data + ) # Throw everything back to paste_data for ease. paste_data = post_results - # If we have a result add some meta data and send to storage # If results is empty, ie no match, and store_all is True, # then append "no_match" to results. This will then force output. 
@@ -356,6 +356,7 @@ def paste_scanner(paste_data, rules_buff): except Exception as e: logging.error(e) + def main(): logger.info("Compile Yara Rules") try: @@ -364,7 +365,7 @@ def main(): default_rules = os.path.join(pastehunter_path, "YaraRules") else: default_rules = False - + if conf["yara"]["custom_rules"] != "none": custom_rules = conf["yara"]["custom_rules"] else: @@ -376,7 +377,7 @@ def main(): conf['yara']['exclude_rules'], conf['yara']['blacklist'], conf['yara']['test_rules'] - ) + ) rules = yara.compile(filepaths=rule_files, externals={'filename': ''}) @@ -445,5 +446,6 @@ def main(): pool.terminate() pool.join() + if __name__ == '__main__': - main() \ No newline at end of file + main() diff --git a/pytest.ini b/pytest.ini index 8aa36a7..dd6091b 100644 --- a/pytest.ini +++ b/pytest.ini @@ -1,2 +1,3 @@ [pytest] -addopts = test/ \ No newline at end of file +addopts = test/ +norecursedirs = .git build docs logs \ No newline at end of file diff --git a/settings.json.sample b/settings.json.sample index 4a43f5d..d333318 100644 --- a/settings.json.sample +++ b/settings.json.sample @@ -19,7 +19,7 @@ "module": "pastehunter.inputs.dumpz", "api_scrape": "https://dumpz.org/api/recent", "api_raw": "https://dumpz.org/api/dump", - "paste_limit": 200, + "paste_limit": 100, "store_all": false }, "gists": { diff --git a/setup.py b/setup.py index b96fff0..e0d0b8c 100644 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ setup( name='pastehunter', - version='1.3.2', + version='1.4.0', author='@kevthehermit @Plazmaz', author_email='info@pastehunter.com', description="Pastehunter", diff --git a/test/test_paste_objects.py b/test/test_paste_objects.py index ce85bed..6534d05 100644 --- a/test/test_paste_objects.py +++ b/test/test_paste_objects.py @@ -31,6 +31,7 @@ def test_slexy_site(): assert paste == 'pid_is_' + str(pid) assert paste_data == {"pid": 123} + def test_pastebin_site_remap(): fake_conf = { 'inputs': { @@ -47,8 +48,9 @@ def test_pastebin_site_remap(): } pastebin_site = PastebinPasteSite(fake_conf) out = pastebin_site.remap_raw_item(data) - assert out == {'key': 'a', 'test': 'b', 'date': '1582595793', 'filename': 'a', 'confname': 'pastebin', - 'pasteid': 'a', 'pastesite': 'pastebin.com', '@timestamp': '2020-02-25T01:56:33'} + assert out == {'key': 'a', 'test': 'b', 'date': '1582595793', 'filename': 'a', 'confname': 'pastebin', + 'pasteid': 'a', 'pastesite': 'pastebin.com', '@timestamp': '2020-02-25T01:56:33'} + def test_pastebin_site(): fake_conf = { @@ -68,7 +70,7 @@ def test_pastebin_site(): { 'key': 'bc', 'date': '1582595793' - } + } ]) pastes, paste_ids = pastebin_site.get_recent_items([]) assert paste_ids == ['ab', 'bc']
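
A closing illustration of the refactor's contract: every input now implements
BasePasteSite from patch 1. The hypothetical input below sketches what a new
site would look like (the site name, URLs, and raw-item field names are
invented for the example; a real input also needs an "inputs" entry in
settings.json.sample and a module-level recent_pastes() wrapper like the
pastebin and ixio inputs have):

    from typing import Any, Dict, List, Union

    from pastehunter.inputs.base_input import BasePasteSite


    class ExamplePasteSite(BasePasteSite):
        """Hypothetical input demonstrating the BasePasteSite contract."""

        def get_paste_id(self, paste_obj: Dict[str, Any]) -> Union[str, int]:
            return paste_obj['id']

        def get_paste_for_id(self, paste_id: Any) -> str:
            # Fetch the raw body of a single paste
            return self.make_request('https://paste.example/raw/{}'.format(paste_id)).text

        def remap_raw_item(self, raw_item: Union[str, Dict]) -> Dict[str, Any]:
            # Normalize the site-specific item into the common paste dict
            return {
                'pasteid': self.get_paste_id(raw_item),
                'pastesite': 'paste.example',
                'confname': 'example',
                'scrape_url': 'https://paste.example/raw/{}'.format(raw_item['id']),
                '@timestamp': raw_item['created_at'],
            }

        def get_recent_items(self, input_history: List[str]):
            # Mirror the pastebin/slexy shape: (new pastes, all seen ids)
            recent = self.make_request('https://paste.example/api/recent').json()
            history = [self.get_paste_id(item) for item in recent]
            paste_list = [self.remap_raw_item(item) for item in recent
                          if self.get_paste_id(item) not in input_history]
            return paste_list, history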