From d56b65a9be7f5dea42c3c1a72e9783f07925ea82 Mon Sep 17 00:00:00 2001 From: SanderKools-Ordina <83595207+SanderKools-Ordina@users.noreply.github.com> Date: Thu, 3 Aug 2023 10:06:16 +0200 Subject: [PATCH] PGP format validation (#59) * adding pgp valid formatting check * adding improvement on the 'newline' check * added a fix for the BOM issue * added readme for new errors * simplify newline message also in readme * improved 'no_uri' message and added user agent * user agent added * error clarification and some information added to the readme * bumping version --------- Co-authored-by: SanderKools --- README.md | 13 ++++++-- requirements.txt | 11 +++---- sectxt/__init__.py | 56 +++++++++++++++++++++++++++++------ test/test_sectxt.py | 72 +++++++++++++++++++++++++++++++++++++++++---- 4 files changed, 130 insertions(+), 22 deletions(-) diff --git a/README.md b/README.md index fbdf604..dae9cf5 100644 --- a/README.md +++ b/README.md @@ -30,7 +30,7 @@ True >>> from sectxt import SecurityTXT >>> s = SecurityTXT("www.example.com") >>> s.errors -[{'code': 'no_uri', 'message': 'The field value must be an URI', 'line': 2}, {'code': 'no_expire', 'message': 'The Expires field is missing', 'line': None}] +[{'code': 'no_uri', 'message': 'Field policy value must be an URI', 'line': 2}, {'code': 'no_expire', 'message': 'The Expires field is missing', 'line': None}] >>> s.recommendations [{'code': 'long_expiry', 'message': 'Expiry date is more than one year in the future', 'line': 3}] ``` @@ -64,17 +64,19 @@ a dict with three keys: | "no_canonical_match" | "Web URI where security.txt is located must match with a 'Canonical' field. In case of redirecting either the first or last web URI of the redirect chain must match." | | "multi_lang" | "'Preferred-Languages' field must not appear more than once." | | "invalid_lang" | "Value in 'Preferred-Languages' field must match one or more language tags as defined in RFC5646, separated by commas." | -| "no_uri" | "Field value must be a URI (e.g. beginning with 'mailto:')." | +| "no_uri" | "Field '{field}' value must be a URI." | | "no_https" | "Web URI must begin with 'https://'." | | "prec_ws" | "There must be no whitespace before the field separator (colon)." | | "no_space" | "Field separator (colon) must be followed by a space." | | "empty_key" | "Field name must not be empty." | | "empty_value" | "Field value must not be empty." | | "invalid_line" | "Line must contain a field name and value, unless the line is blank or contains a comment." | -| "no_line_separators" | "Every line must end with either a carriage return and line feed characters or just a line feed character" | +| "no_line_separators" | "Every line, including the last one, must end with either a carriage return and line feed characters or just a line feed character" | | "signed_format_issue" | "Signed security.txt must start with the header '-----BEGIN PGP SIGNED MESSAGE-----'. " | | "data_after_sig" | "Signed security.txt must not contain data after the signature." | | "no_csaf_file" | "All CSAF fields must point to a provider-metadata.json file." | +| "pgp_data_error" | "Signed message did not contain a correct ASCII-armored PGP block." | +| "pgp_error" | "Decoding or parsing of the pgp message failed." | ### Possible recommendations @@ -94,6 +96,11 @@ a dict with three keys: | "unknown_field"[2] | "security.txt contains an unknown field. Field {unknown_field} is either a custom field which may not be widely supported, or there is a typo in a standardised field name. | +### Security.txt scraping information + +The scraper attempts to find the security.txt of the given domain in the correct location `/.well-known/security.txt`. It also looks in the old location and with unsecure `http` scheme which would result in validation errors. To prevent possible errors getting the file from the domain a user-agent is added to the header of the request. The user agent that is added is `Mozilla/5.0 (Windows NT 6.1; WOW64; rv:12.0) Gecko/20100101 Firefox/12.0`, which would mock a browser in firefox with a Windows 7 OS. +If a security.txt file is found that file is than parsed. Any errors, recommendations or notifications that are found would be returned. + --- [1] The security.txt parser will check for the addition of the digital signature, but it will not verify the validity of the signature. diff --git a/requirements.txt b/requirements.txt index ee77996..47852a4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,6 @@ -requests -python-dateutil -langcodes -pytest -requests-mock +requests==2.31.0 +python-dateutil==2.8.2 +langcodes==3.3.0 +pytest==7.4.0 +requests-mock==1.11.0 +PGPy==0.6.0 diff --git a/sectxt/__init__.py b/sectxt/__init__.py index 4a74347..2e28066 100644 --- a/sectxt/__init__.py +++ b/sectxt/__init__.py @@ -1,6 +1,8 @@ # # SPDX-License-Identifier: EUPL-1.2 # +import codecs + import langcodes import re import sys @@ -9,6 +11,8 @@ from datetime import datetime, timezone from typing import Optional, Union, List, DefaultDict from urllib.parse import urlsplit, urlunsplit +import pgpy +from pgpy.errors import PGPError if sys.version_info < (3, 8): from typing_extensions import TypedDict @@ -18,7 +22,7 @@ import dateutil.parser import requests -__version__ = "0.8.3" +__version__ = "0.9.0" s = requests.Session() @@ -96,7 +100,10 @@ def _add_error( self, code: str, message: str, + explicit_line_no=None ) -> None: + if explicit_line_no: + self._line_no = explicit_line_no err_dict: ErrorDict = {"code": code, "message": message, "line": self._line_no} self._errors.append(err_dict) @@ -144,6 +151,21 @@ def _parse_line(self, line: str) -> LineDict: "'-----BEGIN PGP SIGNED MESSAGE-----'.", ) self._signed = True + + # Check pgp formatting if signed + try: + pgpy.PGPMessage.from_blob(self._content) + except ValueError: + self._add_error( + "pgp_data_error", + "Signed message did not contain a correct ASCII-armored PGP block." + ) + except PGPError as e: + self._add_error( + "pgp_error", + "Decoding or parsing of the pgp message failed." + ) + return {"type": "pgp_envelope", "field_name": None, "value": line} if line == "-----BEGIN PGP SIGNATURE-----" and self._signed: @@ -199,7 +221,7 @@ def _parse_field(self, line: str) -> LineDict: if url_parts.scheme == "": self._add_error( "no_uri", - "Field value must be a URI (e.g. beginning with 'mailto:').", + f"Field '{key}' value must be a URI.", ) elif url_parts.scheme == "http": self._add_error("no_https", "Web URI must begin with 'https://'.") @@ -284,9 +306,10 @@ def validate_contents(self) -> None: if self.lines[-1]["type"] != "empty": self._add_error( "no_line_separators", - "Every line must end with either a carriage " - "return and line feed characters or just a line " - "feed character", + "Every line, including the last one, must end with " + "either a carriage return and line feed characters " + "or just a line feed character", + len(self.lines) ) if "csaf" in self._values: @@ -393,10 +416,12 @@ def __init__(self, url: str, recommend_unknown_fields: bool = True): def _get_str(self, content: bytes) -> str: try: - return content.decode() + if content.startswith(codecs.BOM_UTF8): + content = content.replace(codecs.BOM_UTF8, b'') + return content.decode('utf-8') except UnicodeError: self._add_error("utf8", "Content must be utf-8 encoded.") - return content.decode(errors="replace") + return content.decode('utf-8', errors="replace") def _process(self) -> None: security_txt_found = False @@ -404,7 +429,13 @@ def _process(self) -> None: for path in [".well-known/security.txt", "security.txt"]: url = urlunsplit((scheme, self._netloc, path, None, None)) try: - resp = requests.get(url, timeout=5) + resp = requests.get( + url, + headers={ + 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:12.0) ' + 'Gecko/20100101 Firefox/12.0'}, + timeout=5 + ) except requests.exceptions.SSLError: if not any(d["code"] == "invalid_cert" for d in self._errors): self._add_error( @@ -412,7 +443,14 @@ def _process(self) -> None: "security.txt must be served with a valid TLS certificate.", ) try: - resp = requests.get(url, timeout=5, verify=False) + resp = requests.get( + url, + headers={ + 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:12.0) ' + 'Gecko/20100101 Firefox/12.0'}, + timeout=5, + verify=False + ) except: continue except: diff --git a/test/test_sectxt.py b/test/test_sectxt.py index 3c83c48..7ace21e 100644 --- a/test/test_sectxt.py +++ b/test/test_sectxt.py @@ -33,7 +33,11 @@ -----BEGIN PGP SIGNATURE----- Version: GnuPG v2.2 -[signature] +wpwEAQEIABAFAmTHcawJEDs4gPMoG10dAACN5wP/UozhFqHcUWRNhg4KwfY4 +HHXU8bf222naeYJHgaHadLTJJ8YQIQ9N5fYF7K4BM0jPZc48aaUPaBdhNxw+ +KDtQJWPzVREIbbGLRQ5WNYrLR6/7v1LHTI8RvgY22QZD9EAkFQwgdG8paIP4 +2APWewNf8e01t1oh4n5bDBtr4IaQoj0= +=DHXw -----END PGP SIGNATURE----- """ @@ -127,6 +131,37 @@ def test_signed(self): p = Parser(_signed_example) self.assertTrue(p.is_valid()) + def test_signed_invalid_pgp(self): + # Remove required pgp signature header for pgp data error + content = _signed_example.replace( + "-----BEGIN PGP SIGNATURE-----", "" + ) + p1 = Parser(content) + self.assertFalse(p1.is_valid()) + self.assertEqual( + len([1 for r in p1._errors if r["code"] == "pgp_data_error"]), 1 + ) + # Add dash escaping within the pgp signature for pgp data error + content = _signed_example.replace( + "-----BEGIN PGP SIGNATURE-----", "-----BEGIN PGP SIGNATURE-----\n- \n" + ) + p2 = Parser(content) + self.assertFalse(p2.is_valid()) + self.assertEqual( + len([1 for r in p2._errors if r["code"] == "pgp_data_error"]), 1 + ) + # create an error in the pgp message by invalidating the base64 encoding of the signature + content = _signed_example.replace( + "wpwEAQEIABAFAmTHcawJEDs4gPMoG10dAACN5wP/UozhFqHcUWRNhg4KwfY4", "wpwEAQEIABAFAmTH" + ).replace( + "HHXU8bf222naeYJHgaHadLTJJ8YQIQ9N5fYF7K4BM0jPZc48aaUPaBdhNxw+", "HHXU8bf222naeYJHga" + ) + p3 = Parser(content) + self.assertFalse(p3.is_valid()) + self.assertEqual( + len([1 for r in p3._errors if r["code"] == "pgp_error"]), 1 + ) + def test_signed_no_canonical(self): content = _signed_example.replace( "Canonical: https://example.com/.well-known/security.txt", "" @@ -174,13 +209,27 @@ def test_unknown_fields(self): def test_no_line_separators(self): expire_date = (date.today() + timedelta(days=10)).isoformat() single_line_security_txt = ( - f"Contact: mailto:security@example.com Expires: " + "Contact: mailto:security@example.com Expires: " f"{expire_date}T18:37:07z # All on a single line" ) - p = Parser(single_line_security_txt) - self.assertFalse(p.is_valid()) + p_line_separator = Parser(single_line_security_txt) + self.assertFalse(p_line_separator.is_valid()) + self.assertEqual( + len([1 for r in p_line_separator._errors if r["code"] == "no_line_separators"]), 1 + ) + line_length_4_no_carriage_feed = ( + "line 1\n" + "line 2\n" + "line 3\n" + "Contact: mailto:security@example.com Expires" + ) + p_length_4 = Parser(line_length_4_no_carriage_feed) + self.assertFalse(p_length_4.is_valid()) self.assertEqual( - len([1 for r in p._errors if r["code"] == "no_line_separators"]), 1 + len([1 for r in p_length_4._errors if r["code"] == "no_line_separators"]), 1 + ) + self.assertEqual( + [r["line"] for r in p_length_4._errors if r["code"] == "no_line_separators"], [4] ) def test_csaf_https_uri(self): @@ -239,3 +288,16 @@ def test_invalid_uri_scheme(requests_mock: Mocker): s = SecurityTXT("example.com") if not any(d["code"] == "invalid_uri_scheme" for d in s.errors): pytest.fail("invalid_uri_scheme error code should be given") + + +def test_byte_order_mark(requests_mock: Mocker): + with Mocker() as m: + byte_content_with_bom = b'\xef\xbb\xbf\xef\xbb\xbfContact: mailto:me@example.com\n' \ + b'Expires: 2023-08-11T18:37:07z\n' + m.get( + "https://example.com/.well-known/security.txt", + headers={"content-type": "text/plain"}, + content=byte_content_with_bom, + ) + s = SecurityTXT("example.com") + assert(s.is_valid())