Skip to content

Commit

Permalink
Merge pull request #117 from kevthehermit/feature/refactor-inputs
Browse files Browse the repository at this point in the history
  • Loading branch information
Plazmaz authored Nov 23, 2020
2 parents 38ee68d + 938e136 commit 1bf1e88
Show file tree
Hide file tree
Showing 19 changed files with 1,091 additions and 127 deletions.
2 changes: 1 addition & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ install:
- pip install -e .
script:
- pastehunter-cli
- pytest
- python -m pytest
after_success:
- python setup.py sdist
deploy:
Expand Down
10 changes: 10 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,16 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [1.4.0] - 2020-11-22
### Changed
- Added some error state checks and retry logic to pastebin scraping (#116)
- Refactored paste inputs to use a base class

### Added
- Support for ix.io (#95)
- Additional unit tests (pytest still has some issues with import paths on travis)


## [1.3.2] - 2020-02-15
### Changed
Minor patch fixing error in email yara regexp
Expand Down
4 changes: 0 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,6 @@ PasteHunter is a python3 application that is designed to query a collection of s
For all the pastes it finds it scans the raw contents against a series of Yara rules looking for information that can be used
by an organisation or a researcher.

## Pastebin API Deprecated

We are aware that the pastebin scraping API has been deprecated and are reviewing our options.


## Setup
For setup instructions please see the official documentation https://pastehunter.readthedocs.io/en/latest/installation.html
Expand Down
Empty file added __init__.py
Empty file.
2 changes: 0 additions & 2 deletions conftest.py

This file was deleted.

6 changes: 6 additions & 0 deletions docs/inputs.rst
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,12 @@ Slexy has some heavy rate limits (30 requests per 30 seconds), but may still ret
- **api_raw**: The URL endpoint for the raw paste.
- **api_view**: The URL endpoint to view the paste.

ix.io
---------

ix.io is a smaller site used primarily for console/command line pastes.

- **store_all**: Store all pastes regardless of a rule match.

StackExchange
-------------
Expand Down
73 changes: 51 additions & 22 deletions pastehunter-cli
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,12 @@ import yara
import pastehunter
from pastehunter.common import parse_config

VERSION = 1.0
VERSION = '1.4.0'

# Decided not to make this configurable as it currently really only applies to pastebin but may change in functionality later.
# If someone would like this as a config key, please feel free to open an issue or a PR :)
# TODO: @Plazmaz
MAX_ITEM_RETRIES = 5

# Setup Default logging
root = logging.getLogger()
Expand All @@ -33,7 +38,7 @@ logger = logging.getLogger('pastehunter')
logger.setLevel(logging.INFO)

# Version info
logger.info("Starting PasteHunter Version: {0}".format(VERSION))
logger.info("Starting PasteHunter Version: {}".format(VERSION))

# Parse the config file
logger.info("Reading Configs")
Expand All @@ -43,39 +48,45 @@ conf = parse_config()
if not conf:
sys.exit()


class TimeoutError(Exception):
    """Signals that a time-limited operation ran out of time.

    NOTE: inside this module the name hides Python's builtin TimeoutError.
    """


class timeout:
    """Context manager that interrupts the enclosed block after a delay.

    On entry a SIGALRM handler is installed and an alarm is armed for
    ``seconds``. If the alarm fires, the handler raises TimeoutError inside
    the guarded block. On exit the alarm is cancelled and the handler that
    was active before entry is put back.

    :param seconds: Whole seconds passed to ``signal.alarm``.
    :param error_message: Text included in the raised exception's message.
    """

    def __init__(self, seconds=1, error_message='Timeout'):
        self.seconds = seconds
        self.error_message = error_message
        # Handler that was active before __enter__; restored by __exit__.
        self._previous_handler = None

    def handle_timeout(self, signum, frame):
        """Signal handler: raise TimeoutError when the alarm is delivered."""
        raise TimeoutError("Process timeout: {0}".format(self.error_message))

    def __enter__(self):
        # signal.signal returns the handler it replaces, so keep it around
        # instead of leaving our bound method installed forever.
        self._previous_handler = signal.signal(signal.SIGALRM, self.handle_timeout)
        signal.alarm(self.seconds)
        return self

    def __exit__(self, type, value, traceback):
        # Cancel the pending alarm first so it cannot fire during cleanup.
        signal.alarm(0)
        # None means the old handler was not installed from Python and
        # therefore cannot be re-installed through signal.signal.
        if self._previous_handler is not None:
            signal.signal(signal.SIGALRM, self._previous_handler)
            self._previous_handler = None



# Set up the log file
if "log" in conf and conf["log"]["log_to_file"]:
if conf["log"]["log_path"] != "":
logfile = "{0}/{1}.log".format(conf["log"]["log_path"], conf["log"]["log_file"])
# Assure directory exists
try: os.makedirs(conf["log"]["log_path"], exist_ok=True) # Python>3.2
try:
os.makedirs(conf["log"]["log_path"], exist_ok=True) # Python>3.2
except TypeError:
try:
os.makedirs(conf["log"]["log_path"])
except OSError as exc: # Python >2.5
except OSError as exc: # Python >2.5
if exc.errno == errno.EEXIST and os.path.isdir(conf["log"]["log_path"]):
pass
else: logger.error("Can not create log file {0}: {1}".format(conf["log"]["log_path"], exc))
else:
logger.error("Can not create log file {0}: {1}".format(conf["log"]["log_path"], exc))
else:
logfile = "{0}.log".format(conf["log"]["log_file"])
fileHandler = handlers.RotatingFileHandler(logfile, mode='a+', maxBytes=(1048576*5), backupCount=7)
fileHandler = handlers.RotatingFileHandler(logfile, mode='a+', maxBytes=(1048576 * 5), backupCount=7)
if conf["log"]["format"] != "":
fileFormatter = logging.Formatter("{0}".format(conf["log"]["format"]))
fileHandler.setFormatter(fileFormatter)
Expand Down Expand Up @@ -111,7 +122,6 @@ for input_type, input_values in conf["inputs"].items():
input_list.append(input_values["module"])
logger.info("Enabled Input: {0}".format(input_type))


# Configure Outputs
logger.info("Configure Outputs")
outputs = []
Expand Down Expand Up @@ -192,11 +202,31 @@ def paste_scanner(paste_data, rules_buff):
if paste_site == 'slexy.org':
headers['User-Agent'] = 'PasteHunter'

req = requests.get(raw_paste_uri, headers=headers)
if req.status_code == 200:
raw_paste_data = req.text
else:
logger.error("Request returned unexpected response code {}: {}".format(req.status_code, req.text))
attempt_count = 0
while attempt_count < MAX_ITEM_RETRIES:
attempt_count += 1
req = requests.get(raw_paste_uri, headers=headers)
if req.status_code == 200:
raw_paste_data = req.text
if attempt_count > 1:
logger.warning('Successfully resolved 429 exception')
break

# We may want to handle other status codes in the future,
# for now 429 is the only code we retry for, just to avoid issues with
# rate limiting and hammering sites for 404s or outages
elif req.status_code == 429:
logger.warning('Encountered unexpected 429 when requesting item at %s'
+ ' for site "%s". Retrying (attempt %d)...', raw_paste_uri,
paste_site, attempt_count + 1)
sleep(10)
else:
logger.error("Request returned unexpected response code %d: %s", req.status_code,
req.text)

if attempt_count > 1:
logger.error("Unable to resolve 429 exception after %d retries, giving up on item %s.",
MAX_ITEM_RETRIES, raw_paste_uri)

# Cover fetch site SSLErrors
except requests.exceptions.SSLError as e:
Expand Down Expand Up @@ -269,7 +299,6 @@ def paste_scanner(paste_data, rules_buff):
# remove the confname key as its not really needed past this point
del paste_data['confname']


# Blacklist Check
# If any of the blacklist rules appear then empty the result set
blacklisted = False
Expand All @@ -283,7 +312,6 @@ def paste_scanner(paste_data, rules_buff):
return True
return False


# Post Process

# If post module is enabled and the paste has a matching rule.
Expand All @@ -295,14 +323,13 @@ def paste_scanner(paste_data, rules_buff):
logger.info("Running Post Module {0} on {1}".format(post_values["module"], paste_data["pasteid"]))
post_module = importlib.import_module(post_values["module"])
post_results = post_module.run(results,
raw_paste_data,
paste_data
)
raw_paste_data,
paste_data
)

# Throw everything back to paste_data for ease.
paste_data = post_results


# If we have a result add some meta data and send to storage
# If results is empty, ie no match, and store_all is True,
# then append "no_match" to results. This will then force output.
Expand All @@ -329,6 +356,7 @@ def paste_scanner(paste_data, rules_buff):
except Exception as e:
logging.error(e)


def main():
logger.info("Compile Yara Rules")
try:
Expand All @@ -337,7 +365,7 @@ def main():
default_rules = os.path.join(pastehunter_path, "YaraRules")
else:
default_rules = False

if conf["yara"]["custom_rules"] != "none":
custom_rules = conf["yara"]["custom_rules"]
else:
Expand All @@ -349,7 +377,7 @@ def main():
conf['yara']['exclude_rules'],
conf['yara']['blacklist'],
conf['yara']['test_rules']
)
)

rules = yara.compile(filepaths=rule_files, externals={'filename': ''})

Expand Down Expand Up @@ -418,5 +446,6 @@ def main():
pool.terminate()
pool.join()


if __name__ == '__main__':
main()
main()
26 changes: 26 additions & 0 deletions pastehunter/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,10 @@
logger = logging.getLogger('pastehunter')
home = os.path.expanduser("~")

BASE62_CHARS = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
BASE_LOOKUP = dict((c, i) for i, c in enumerate(BASE62_CHARS))
BASE_LEN = len(BASE62_CHARS)

# Parse the config file in to a dict
def parse_config():
conf = None
Expand All @@ -26,3 +30,25 @@ def parse_config():
logger.error("Unable to read config file '~/.config/pastehunter.json'")

return conf


# Most of this was pulled from https://stackoverflow.com/a/2549514
def base62_decode(input: str) -> int:
    """Convert a base62 string into the integer it represents.

    Each character is looked up in BASE_LOOKUP; the rightmost character is
    the least significant digit. An empty string decodes to 0.

    :param input: The base62-encoded text
    :return: The decoded integer
    :raises KeyError: If a character is not a key of BASE_LOOKUP
    """
    base = len(BASE_LOOKUP)
    return sum(
        (base ** position) * BASE_LOOKUP[digit]
        for position, digit in enumerate(reversed(input))
    )


def base62_encode(integer) -> str:
    """Render a non-negative integer as a base62 string.

    Digits are taken from BASE62_CHARS, most significant first. Zero is
    encoded as the first character of BASE62_CHARS.

    :param integer: The non-negative integer to encode
    :return: The base62 representation
    :raises ValueError: If ``integer`` is negative
    """
    # Floor division never brings a negative number to 0 (-1 // 62 == -1),
    # so without this guard the loop below would spin forever.
    if integer < 0:
        raise ValueError("base62_encode requires a non-negative integer")

    if integer == 0:
        return BASE62_CHARS[0]

    digits = []
    while integer != 0:
        integer, remainder = divmod(integer, BASE_LEN)
        digits.append(BASE62_CHARS[remainder])

    # Digits were collected least significant first.
    return ''.join(reversed(digits))
52 changes: 52 additions & 0 deletions pastehunter/inputs/base_input.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
from abc import ABC, abstractmethod
from typing import Any, Optional, Dict, List, Union

import requests


class BasePasteSite(ABC):
    """Abstract base class describing the interface of a paste site input.

    Subclasses must implement every abstract method below; ``make_request``
    is a shared helper for issuing HTTP GET requests.
    """

    def make_request(self, url: str, timeout: Optional[int] = 10, headers: Optional[Dict[str, Any]] = None):
        """
        Make a GET request and return the results
        :param url: The url to request
        :param timeout: The timeout for the request
        :param headers: The headers dict
        :return: The response object returned by ``requests.get``
        """
        req = requests.get(url, headers=headers, timeout=timeout)
        return req

    @abstractmethod
    def remap_raw_item(self, raw_item: Union[str, Dict]) -> Dict[str, Any]:
        """
        Takes a raw item and remaps it to a normalized paste dict
        :param raw_item: The raw item, either a string or a dict
        :return: The paste dict
        """
        pass

    @abstractmethod
    def get_paste_for_id(self, paste_id: Any) -> str:
        """
        Returns a paste for the given paste_id
        :param paste_id: The paste to retrieve
        :return: A raw paste object
        """
        pass

    @abstractmethod
    def get_paste_id(self, paste_obj: Dict[str, Any]) -> Union[str, int]:
        """
        Returns an id for the given paste object
        :param paste_obj: The raw paste dict
        :return: The paste id (str or int)
        """
        pass

    @abstractmethod
    def get_recent_items(self, input_history: List[str]):
        """
        Gets recent items
        :param input_history: Presumably identifiers of items already
            processed -- TODO confirm against the implementations
        :return: a list of recent items
        """
        pass
Loading

0 comments on commit 1bf1e88

Please sign in to comment.