From d56b65a9be7f5dea42c3c1a72e9783f07925ea82 Mon Sep 17 00:00:00 2001
From: SanderKools-Ordina
 <83595207+SanderKools-Ordina@users.noreply.github.com>
Date: Thu, 3 Aug 2023 10:06:16 +0200
Subject: [PATCH] PGP format validation (#59)

* adding pgp valid formatting check

* adding improvement on the 'newline' check

* added a fix for the BOM issue

* added readme for new errors

* simplify newline message also in readme

* improved 'no_uri' message and added user agent

* user agent added

* error clarification and some information added to the readme

* bumping version

---------

Co-authored-by: SanderKools <s.w.a.kools@minezk.nl>
---
 README.md           | 13 ++++++--
 requirements.txt    | 11 +++----
 sectxt/__init__.py  | 56 +++++++++++++++++++++++++++++------
 test/test_sectxt.py | 72 +++++++++++++++++++++++++++++++++++++++++----
 4 files changed, 130 insertions(+), 22 deletions(-)

diff --git a/README.md b/README.md
index fbdf604..dae9cf5 100644
--- a/README.md
+++ b/README.md
@@ -30,7 +30,7 @@ True
 >>> from sectxt import SecurityTXT
 >>> s = SecurityTXT("www.example.com")
 >>> s.errors
-[{'code': 'no_uri', 'message': 'The field value must be an URI', 'line': 2}, {'code': 'no_expire', 'message': 'The Expires field is missing', 'line': None}]
+[{'code': 'no_uri', 'message': 'Field policy value must be an URI', 'line': 2}, {'code': 'no_expire', 'message': 'The Expires field is missing', 'line': None}]
 >>> s.recommendations
 [{'code': 'long_expiry', 'message': 'Expiry date is more than one year in the future', 'line': 3}]
 ```
@@ -64,17 +64,19 @@ a dict with three keys:
 | "no_canonical_match"  | "Web URI where security.txt is located must match with a 'Canonical' field. In case of redirecting either the first or last web URI of the redirect chain must match." |
 | "multi_lang"          | "'Preferred-Languages' field must not appear more than once."                                                                                                          |
 | "invalid_lang"        | "Value in 'Preferred-Languages' field must match one or more language tags as defined in RFC5646, separated by commas."                                                |
-| "no_uri"              | "Field value must be a URI (e.g. beginning with 'mailto:')."                                                                                                           |
+| "no_uri"              | "Field '{field}' value must be a URI."                                                                                                                                 |
 | "no_https"            | "Web URI must begin with 'https://'."                                                                                                                                  |
 | "prec_ws"             | "There must be no whitespace before the field separator (colon)."                                                                                                      |
 | "no_space"            | "Field separator (colon) must be followed by a space."                                                                                                                 | 
 | "empty_key"           | "Field name must not be empty."                                                                                                                                        |
 | "empty_value"         | "Field value must not be empty."                                                                                                                                       |
 | "invalid_line"        | "Line must contain a field name and value, unless the line is blank or contains a comment."                                                                            |
-| "no_line_separators"  | "Every line must end with either a carriage return and line feed characters or just a line feed character"                                                             |
+| "no_line_separators"  | "Every line, including the last one, must end with either a carriage return and line feed characters or just a line feed character"                                    |
 | "signed_format_issue" | "Signed security.txt must start with the header '-----BEGIN PGP SIGNED MESSAGE-----'. "                                                                                |
 | "data_after_sig"      | "Signed security.txt must not contain data after the signature."                                                                                                       |
 | "no_csaf_file"        | "All CSAF fields must point to a provider-metadata.json file."                                                                                                         |
+| "pgp_data_error"      | "Signed message did not contain a correct ASCII-armored PGP block."                                                                                                    |
+| "pgp_error"           | "Decoding or parsing of the pgp message failed."                                                                                                                       |
 
 
 ### Possible recommendations
@@ -94,6 +96,11 @@ a dict with three keys:
 | "unknown_field"<sup>[2]</sup> | "security.txt contains an unknown field. Field {unknown_field} is either a custom field which may not be widely supported, or there is a typo in a standardised field name. |
 
 
+### Security.txt scraping information
+
+The scraper attempts to find the security.txt of the given domain in the correct location `/.well-known/security.txt`. It also looks in the old location and with unsecure `http` scheme which would result in validation errors. To prevent possible errors getting the file from the domain a user-agent is added to the header of the request. The user agent that is added is `Mozilla/5.0 (Windows NT 6.1; WOW64; rv:12.0) Gecko/20100101 Firefox/12.0`, which would mock a browser in firefox with a Windows 7 OS.
+If a security.txt file is found that file is than parsed. Any errors, recommendations or notifications that are found would be returned.
+
 ---
 
 [1] The security.txt parser will check for the addition of the digital signature, but it will not verify the validity of the signature.
diff --git a/requirements.txt b/requirements.txt
index ee77996..47852a4 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,6 @@
-requests
-python-dateutil
-langcodes
-pytest
-requests-mock
+requests==2.31.0
+python-dateutil==2.8.2
+langcodes==3.3.0
+pytest==7.4.0
+requests-mock==1.11.0
+PGPy==0.6.0
diff --git a/sectxt/__init__.py b/sectxt/__init__.py
index 4a74347..2e28066 100644
--- a/sectxt/__init__.py
+++ b/sectxt/__init__.py
@@ -1,6 +1,8 @@
 #
 # SPDX-License-Identifier: EUPL-1.2
 #
+import codecs
+
 import langcodes
 import re
 import sys
@@ -9,6 +11,8 @@
 from datetime import datetime, timezone
 from typing import Optional, Union, List, DefaultDict
 from urllib.parse import urlsplit, urlunsplit
+import pgpy
+from pgpy.errors import PGPError
 
 if sys.version_info < (3, 8):
     from typing_extensions import TypedDict
@@ -18,7 +22,7 @@
 import dateutil.parser
 import requests
 
-__version__ = "0.8.3"
+__version__ = "0.9.0"
 
 s = requests.Session()
 
@@ -96,7 +100,10 @@ def _add_error(
         self,
         code: str,
         message: str,
+        explicit_line_no=None
     ) -> None:
+        if explicit_line_no:
+            self._line_no = explicit_line_no
         err_dict: ErrorDict = {"code": code, "message": message, "line": self._line_no}
         self._errors.append(err_dict)
 
@@ -144,6 +151,21 @@ def _parse_line(self, line: str) -> LineDict:
                     "'-----BEGIN PGP SIGNED MESSAGE-----'.",
                 )
             self._signed = True
+
+            # Check pgp formatting if signed
+            try:
+                pgpy.PGPMessage.from_blob(self._content)
+            except ValueError:
+                self._add_error(
+                    "pgp_data_error",
+                    "Signed message did not contain a correct ASCII-armored PGP block."
+                )
+            except PGPError as e:
+                self._add_error(
+                    "pgp_error",
+                    "Decoding or parsing of the pgp message failed."
+                )
+
             return {"type": "pgp_envelope", "field_name": None, "value": line}
 
         if line == "-----BEGIN PGP SIGNATURE-----" and self._signed:
@@ -199,7 +221,7 @@ def _parse_field(self, line: str) -> LineDict:
             if url_parts.scheme == "":
                 self._add_error(
                     "no_uri",
-                    "Field value must be a URI (e.g. beginning with 'mailto:').",
+                    f"Field '{key}' value must be a URI.",
                 )
             elif url_parts.scheme == "http":
                 self._add_error("no_https", "Web URI must begin with 'https://'.")
@@ -284,9 +306,10 @@ def validate_contents(self) -> None:
         if self.lines[-1]["type"] != "empty":
             self._add_error(
                 "no_line_separators",
-                "Every line must end with either a carriage "
-                "return and line feed characters or just a line "
-                "feed character",
+                "Every line, including the last one, must end with "
+                "either a carriage return and line feed characters "
+                "or just a line feed character",
+                len(self.lines)
             )
 
         if "csaf" in self._values:
@@ -393,10 +416,12 @@ def __init__(self, url: str, recommend_unknown_fields: bool = True):
 
     def _get_str(self, content: bytes) -> str:
         try:
-            return content.decode()
+            if content.startswith(codecs.BOM_UTF8):
+                content = content.replace(codecs.BOM_UTF8, b'')
+            return content.decode('utf-8')
         except UnicodeError:
             self._add_error("utf8", "Content must be utf-8 encoded.")
-        return content.decode(errors="replace")
+        return content.decode('utf-8', errors="replace")
 
     def _process(self) -> None:
         security_txt_found = False
@@ -404,7 +429,13 @@ def _process(self) -> None:
             for path in [".well-known/security.txt", "security.txt"]:
                 url = urlunsplit((scheme, self._netloc, path, None, None))
                 try:
-                    resp = requests.get(url, timeout=5)
+                    resp = requests.get(
+                        url,
+                        headers={
+                            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:12.0) '
+                                          'Gecko/20100101 Firefox/12.0'},
+                        timeout=5
+                    )
                 except requests.exceptions.SSLError:
                     if not any(d["code"] == "invalid_cert" for d in self._errors):
                         self._add_error(
@@ -412,7 +443,14 @@ def _process(self) -> None:
                             "security.txt must be served with a valid TLS certificate.",
                         )
                     try:
-                        resp = requests.get(url, timeout=5, verify=False)
+                        resp = requests.get(
+                            url,
+                            headers={
+                                'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:12.0) '
+                                              'Gecko/20100101 Firefox/12.0'},
+                            timeout=5,
+                            verify=False
+                        )
                     except:
                         continue
                 except:
diff --git a/test/test_sectxt.py b/test/test_sectxt.py
index 3c83c48..7ace21e 100644
--- a/test/test_sectxt.py
+++ b/test/test_sectxt.py
@@ -33,7 +33,11 @@
 -----BEGIN PGP SIGNATURE-----
 Version: GnuPG v2.2
 
-[signature]
+wpwEAQEIABAFAmTHcawJEDs4gPMoG10dAACN5wP/UozhFqHcUWRNhg4KwfY4
+HHXU8bf222naeYJHgaHadLTJJ8YQIQ9N5fYF7K4BM0jPZc48aaUPaBdhNxw+
+KDtQJWPzVREIbbGLRQ5WNYrLR6/7v1LHTI8RvgY22QZD9EAkFQwgdG8paIP4
+2APWewNf8e01t1oh4n5bDBtr4IaQoj0=
+=DHXw
 -----END PGP SIGNATURE-----
 """
 
@@ -127,6 +131,37 @@ def test_signed(self):
         p = Parser(_signed_example)
         self.assertTrue(p.is_valid())
 
+    def test_signed_invalid_pgp(self):
+        # Remove required pgp signature header for pgp data error
+        content = _signed_example.replace(
+            "-----BEGIN PGP SIGNATURE-----", ""
+        )
+        p1 = Parser(content)
+        self.assertFalse(p1.is_valid())
+        self.assertEqual(
+            len([1 for r in p1._errors if r["code"] == "pgp_data_error"]), 1
+        )
+        # Add dash escaping within the pgp signature for pgp data error
+        content = _signed_example.replace(
+            "-----BEGIN PGP SIGNATURE-----", "-----BEGIN PGP SIGNATURE-----\n- \n"
+        )
+        p2 = Parser(content)
+        self.assertFalse(p2.is_valid())
+        self.assertEqual(
+            len([1 for r in p2._errors if r["code"] == "pgp_data_error"]), 1
+        )
+        # create an error in the pgp message by invalidating the base64 encoding of the signature
+        content = _signed_example.replace(
+            "wpwEAQEIABAFAmTHcawJEDs4gPMoG10dAACN5wP/UozhFqHcUWRNhg4KwfY4", "wpwEAQEIABAFAmTH"
+        ).replace(
+            "HHXU8bf222naeYJHgaHadLTJJ8YQIQ9N5fYF7K4BM0jPZc48aaUPaBdhNxw+", "HHXU8bf222naeYJHga"
+        )
+        p3 = Parser(content)
+        self.assertFalse(p3.is_valid())
+        self.assertEqual(
+            len([1 for r in p3._errors if r["code"] == "pgp_error"]), 1
+        )
+
     def test_signed_no_canonical(self):
         content = _signed_example.replace(
             "Canonical: https://example.com/.well-known/security.txt", ""
@@ -174,13 +209,27 @@ def test_unknown_fields(self):
     def test_no_line_separators(self):
         expire_date = (date.today() + timedelta(days=10)).isoformat()
         single_line_security_txt = (
-            f"Contact: mailto:security@example.com  Expires: "
+            "Contact: mailto:security@example.com  Expires: "
             f"{expire_date}T18:37:07z  # All on a single line"
         )
-        p = Parser(single_line_security_txt)
-        self.assertFalse(p.is_valid())
+        p_line_separator = Parser(single_line_security_txt)
+        self.assertFalse(p_line_separator.is_valid())
+        self.assertEqual(
+            len([1 for r in p_line_separator._errors if r["code"] == "no_line_separators"]), 1
+        )
+        line_length_4_no_carriage_feed = (
+            "line 1\n"
+            "line 2\n"
+            "line 3\n"
+            "Contact: mailto:security@example.com  Expires"
+        )
+        p_length_4 = Parser(line_length_4_no_carriage_feed)
+        self.assertFalse(p_length_4.is_valid())
         self.assertEqual(
-            len([1 for r in p._errors if r["code"] == "no_line_separators"]), 1
+            len([1 for r in p_length_4._errors if r["code"] == "no_line_separators"]), 1
+        )
+        self.assertEqual(
+            [r["line"] for r in p_length_4._errors if r["code"] == "no_line_separators"], [4]
         )
 
     def test_csaf_https_uri(self):
@@ -239,3 +288,16 @@ def test_invalid_uri_scheme(requests_mock: Mocker):
         s = SecurityTXT("example.com")
         if not any(d["code"] == "invalid_uri_scheme" for d in s.errors):
             pytest.fail("invalid_uri_scheme error code should be given")
+
+
+def test_byte_order_mark(requests_mock: Mocker):
+    with Mocker() as m:
+        byte_content_with_bom = b'\xef\xbb\xbf\xef\xbb\xbfContact: mailto:me@example.com\n' \
+                                b'Expires: 2023-08-11T18:37:07z\n'
+        m.get(
+            "https://example.com/.well-known/security.txt",
+            headers={"content-type": "text/plain"},
+            content=byte_content_with_bom,
+        )
+        s = SecurityTXT("example.com")
+        assert(s.is_valid())