From 76cd11e8710e736a1da91326cadf53a29f43db5b Mon Sep 17 00:00:00 2001 From: Zdenek Styblik Date: Wed, 5 Jun 2024 10:19:32 +0200 Subject: [PATCH] Add support for and utilize HTTP headers ETag/Last-Modified --- gh2slack.py | 11 +- phpbb2slack.py | 67 +++++++---- rss2irc.py | 177 +++++++++++++++++++++------- rss2slack.py | 38 ++++--- tests/test_gh2slack.py | 6 +- tests/test_phpbb2slack.py | 173 +++++++++++++++++++++++----- tests/test_rss2irc.py | 234 +++++++++++++++++++++++++++++++++++++- tests/test_rss2slack.py | 152 ++++++++++++++++++++++--- 8 files changed, 724 insertions(+), 134 deletions(-) diff --git a/gh2slack.py b/gh2slack.py index d13f824..57b9423 100755 --- a/gh2slack.py +++ b/gh2slack.py @@ -40,7 +40,10 @@ def format_message( try: title = cache_item["title"].encode("utf-8") except UnicodeEncodeError: - logger.error("Failed to encode title as UTF-8: %s", repr(title)) + logger.error( + "Failed to encode title as UTF-8: %s", + repr(cache_item.get("title", None)), + ) logger.error(traceback.format_exc()) title = "Unknown title due to UTF-8 exception, {:s}#{:d}".format( section, cache_item["number"] @@ -157,7 +160,7 @@ def main(): sys.exit(0) cache = rss2irc.read_cache(logger, args.cache) - scrub_cache(logger, cache) + scrub_items(logger, cache) # Note: I have failed to find web link to repo in GH response. # Therefore, let's create one. @@ -220,7 +223,7 @@ def parse_args() -> argparse.Namespace: "--cache-expiration", dest="cache_expiration", type=int, - default=rss2irc.EXPIRATION, + default=rss2irc.CACHE_EXPIRATION, help="Time, in seconds, for how long to keep items " "in cache.", ) parser.add_argument( @@ -344,7 +347,7 @@ def process_page_items( return to_publish -def scrub_cache(logger: logging.Logger, cache: rss2irc.CachedData) -> None: +def scrub_items(logger: logging.Logger, cache: rss2irc.CachedData) -> None: """Scrub cache and remove expired items.""" time_now = int(time.time()) for key in list(cache.items.keys()): diff --git a/phpbb2slack.py b/phpbb2slack.py index ff99877..3ddf277 100755 --- a/phpbb2slack.py +++ b/phpbb2slack.py @@ -81,32 +81,32 @@ def main(): try: slack_token = rss2slack.get_slack_token() authors = get_authors_from_file(logger, args.authors_file) + cache = rss2irc.read_cache(logger, args.cache) + source = cache.get_source_by_url(args.rss_url) + + rsp = rss2irc.get_rss( + logger, + args.rss_url, + args.rss_http_timeout, + source.make_caching_headers(), + ) + if rsp.status_code == 304: + logger.debug("No new RSS data since the last run") + rss2irc.write_cache(cache, args.cache) + sys.exit(0) - data = rss2irc.get_rss(logger, args.rss_url, args.rss_http_timeout) - if not data: + if not rsp.text: logger.error("Failed to get RSS from %s", args.rss_url) sys.exit(1) - news = parse_news(data, authors) + news = parse_news(rsp.text, authors) if not news: logger.info("No news?") sys.exit(0) - cache = rss2irc.read_cache(logger, args.cache) - scrub_cache(logger, cache) - - for key in list(news.keys()): - if key not in cache.items: - continue - - logger.debug("Key %s found in cache", key) - comments_cached = int(cache.items[key]["comments_cnt"]) - comments_actual = int(news[key]["comments_cnt"]) - if comments_cached == comments_actual: - cache.items[key]["expiration"] = ( - int(time.time()) + args.cache_expiration - ) - news.pop(key) + source.extract_caching_headers(rsp.headers) + scrub_items(logger, cache) + prune_news(logger, cache, news, args.cache_expiration) slack_client = rss2slack.get_slack_web_client( slack_token, args.slack_base_url, args.slack_timeout @@ -126,8 
+126,8 @@ def main():
         finally:
             time.sleep(args.sleep)
 
-        expiration = int(time.time()) + args.cache_expiration
-        update_cache(cache, news, expiration)
+        update_items_expiration(cache, news, args.cache_expiration)
+        cache.scrub_data_sources()
         rss2irc.write_cache(cache, args.cache)
     except Exception:
         logger.debug(traceback.format_exc())
@@ -271,7 +271,27 @@ def parse_news(data: str, authors: List[str]) -> Dict:
     return news
 
 
-def scrub_cache(logger: logging.Logger, cache: rss2irc.CachedData) -> None:
+def prune_news(
+    logger: logging.Logger,
+    cache: rss2irc.CachedData,
+    news: Dict[str, Dict],
+    expiration: int = CACHE_EXPIRATION,
+) -> None:
+    """Prune news items which are already in the cache."""
+    item_expiration = int(time.time()) + expiration
+    for key in list(news.keys()):
+        if key not in cache.items:
+            continue
+
+        logger.debug("Key %s found in cache", key)
+        comments_cached = int(cache.items[key]["comments_cnt"])
+        comments_actual = int(news[key]["comments_cnt"])
+        if comments_cached == comments_actual:
+            cache.items[key]["expiration"] = item_expiration
+            news.pop(key)
+
+
+def scrub_items(logger: logging.Logger, cache: rss2irc.CachedData) -> None:
     """Scrub cache and remove expired items."""
     time_now = int(time.time())
     for key in list(cache.items.keys()):
@@ -290,13 +310,14 @@ def scrub_cache(logger: logging.Logger, cache: rss2irc.CachedData) -> None:
         cache.items.pop(key)
 
 
-def update_cache(
+def update_items_expiration(
     cache: rss2irc.CachedData, news: Dict, expiration: int
 ) -> None:
-    """Update cache contents."""
+    """Update expiration of items in cache based on news dict."""
+    item_expiration = int(time.time()) + expiration
     for key in list(news.keys()):
         cache.items[key] = {
-            "expiration": expiration,
+            "expiration": item_expiration,
             "comments_cnt": int(news[key]["comments_cnt"]),
         }
diff --git a/rss2irc.py b/rss2irc.py
index f397764..48617f3 100755
--- a/rss2irc.py
+++ b/rss2irc.py
@@ -21,16 +21,73 @@
 import feedparser
 import requests
 
-EXPIRATION = 86400  # seconds
+CACHE_EXPIRATION = 86400  # seconds
+DATA_SOURCE_EXPIRATION = 30 * 86400  # seconds
 HTTP_TIMEOUT = 30  # seconds
 
 
+@dataclass
+class HTTPSource:
+    """Class represents HTTP data source."""
+
+    http_etag: str = field(default_factory=str)
+    http_last_modified: str = field(default_factory=str)
+    last_used_ts: int = 0
+    url: str = field(default_factory=str)
+
+    def extract_caching_headers(self, headers: dict) -> None:
+        """Extract cache-related headers from the given dict."""
+        self.http_etag = ""
+        self.http_last_modified = ""
+        for key, value in headers.items():
+            key = key.lower()
+            if key == "etag":
+                self.http_etag = value
+            elif key == "last-modified":
+                self.http_last_modified = value
+
+    def make_caching_headers(self) -> Dict[str, str]:
+        """Return cache-related headers as a dict."""
+        headers = {}
+        if self.http_etag:
+            headers["if-none-match"] = self.http_etag
+
+        if self.http_last_modified:
+            headers["if-modified-since"] = self.http_last_modified
+
+        return headers
+
+
 @dataclass
 class CachedData:
     """CachedData represents locally cached data and state."""
 
+    data_sources: dict = field(default_factory=dict)
     items: dict = field(default_factory=dict)
 
+    def get_source_by_url(self, url: str) -> HTTPSource:
+        """Return source by URL.
+
+        If source doesn't exist, it will be created.
+ """ + source = self.data_sources.get(url, None) + if source: + source.last_used_ts = int(time.time()) + return source + + self.data_sources[url] = HTTPSource( + last_used_ts=int(time.time()), url=url + ) + return self.get_source_by_url(url) + + def scrub_data_sources(self, expiration: int = DATA_SOURCE_EXPIRATION): + """Delete expired data sources.""" + now = int(time.time()) + for key in list(self.data_sources.keys()): + diff = now - self.data_sources[key].last_used_ts + if int(diff) > expiration: + self.data_sources.pop(key) + def format_message( url: str, msg_attrs: Tuple[str, str], handle: str = "" @@ -53,18 +110,24 @@ def format_message( def get_rss( - logger: logging.Logger, url: str, timeout: int = HTTP_TIMEOUT -) -> str: + logger: logging.Logger, + url: str, + timeout: int = HTTP_TIMEOUT, + extra_headers: Dict = None, +) -> requests.models.Response: """Return body of given URL as a string.""" # Randomize user agent, because CF likes to block for no apparent reason. - logger.debug("Get %s", url) user_agent = "rss2irc_{:d}".format(int(time.time())) - rsp = requests.get(url, timeout=timeout, headers={"User-Agent": user_agent}) + headers = {"User-Agent": user_agent} + if extra_headers: + for key, value in extra_headers.items(): + headers[key] = value + + logger.debug("Get %s", url) + rsp = requests.get(url, timeout=timeout, headers=headers) logger.debug("Got HTTP Status Code: %i", rsp.status_code) rsp.raise_for_status() - data = rsp.text - del rsp - return data + return rsp def main(): @@ -84,32 +147,39 @@ def main(): sys.exit(1) try: - data = get_rss(logger, args.rss_url, args.rss_http_timeout) - if not data: + cache = read_cache(logger, args.cache) + source = cache.get_source_by_url(args.rss_url) + + rsp = get_rss( + logger, + args.rss_url, + args.rss_http_timeout, + source.make_caching_headers(), + ) + if rsp.status_code == 304: + logger.debug("No new RSS data since the last run") + write_cache(cache, args.cache) + sys.exit(0) + + if not rsp.text: logger.error("Failed to get RSS from %s", args.rss_url) sys.exit(1) - news = parse_news(data) + news = parse_news(rsp.text) if not news: logger.info("No news?") + write_cache(cache, args.cache) sys.exit(0) - cache = read_cache(logger, args.cache) - scrub_cache(logger, cache) - - for key in list(news.keys()): - if key in cache.items: - logger.debug("Key %s found in cache", key) - cache.items[key] = int(time.time()) + args.cache_expiration - news.pop(key) + source.extract_caching_headers(rsp.headers) + scrub_items(logger, cache) + prune_news(logger, cache, news, args.cache_expiration) if not args.cache_init: write_data(logger, news, args.output, args.handle, args.sleep) - expiration = int(time.time()) + args.cache_expiration - for key in list(news.keys()): - cache.items[key] = expiration - + update_items_expiration(cache, news, args.cache_expiration) + cache.scrub_data_sources() write_cache(cache, args.cache) # TODO(zstyblik): remove error file except Exception: @@ -171,7 +241,7 @@ def parse_args() -> argparse.Namespace: "--cache-expiration", dest="cache_expiration", type=int, - default=EXPIRATION, + default=CACHE_EXPIRATION, help="Time, in seconds, for how long to keep items in cache.", ) parser.add_argument( @@ -210,30 +280,51 @@ def parse_news(data: str) -> Dict[str, Tuple[str, str]]: return news +def prune_news( + logger: logging.Logger, + cache: CachedData, + news: Dict[str, Tuple[str, str]], + expiration: int = CACHE_EXPIRATION, +) -> None: + """Prune news which already are in cache.""" + item_expiration = int(time.time()) + 
expiration + for key in list(news.keys()): + if key in cache.items: + logger.debug("Key %s found in cache", key) + cache.items[key] = item_expiration + news.pop(key) + + def read_cache(logger: logging.Logger, cache_file: str) -> CachedData: """Read file with Py pickle in it.""" if not cache_file: return CachedData() - if not os.path.exists(cache_file): - logger.warning("Cache file '%s' doesn't exist.", cache_file) - return CachedData() - - with open(cache_file, "rb") as fhandle: - try: + try: + with open(cache_file, "rb") as fhandle: cache = pickle.load(fhandle) - except EOFError: - # Note: occurred with empty file. - cache = CachedData() - logger.debug( - "Cache file is probably empty: %s", traceback.format_exc() - ) + except FileNotFoundError: + cache = CachedData() + logger.warning("Cache file '%s' doesn't exist.", cache_file) + except EOFError: + # Note: occurred with empty file. + cache = CachedData() + logger.debug( + "Cache file '%s' is probably empty: %s", + cache_file, + traceback.format_exc(), + ) logger.debug(cache) return cache -def scrub_cache(logger: logging.Logger, cache: CachedData) -> None: +def signal_handler(signum, frame): + """Handle SIGALRM signal.""" + raise ValueError + + +def scrub_items(logger: logging.Logger, cache: CachedData) -> None: """Scrub cache and remove expired items.""" time_now = time.time() for key in list(cache.items.keys()): @@ -252,9 +343,15 @@ def scrub_cache(logger: logging.Logger, cache: CachedData) -> None: cache.items.pop(key) -def signal_handler(signum, frame): - """Handle SIGALRM signal.""" - raise ValueError +def update_items_expiration( + cache: CachedData, + news: Dict[str, Tuple[str, str]], + expiration: int = CACHE_EXPIRATION, +): + """Update expiration of items in cache based on news dict.""" + item_expiration = int(time.time()) + expiration + for key in list(news.keys()): + cache.items[key] = item_expiration def write_cache(data: CachedData, cache_file: str) -> None: diff --git a/rss2slack.py b/rss2slack.py index 3e954a6..9382c1f 100755 --- a/rss2slack.py +++ b/rss2slack.py @@ -78,24 +78,32 @@ def main(): try: slack_token = get_slack_token() - data = rss2irc.get_rss(logger, args.rss_url, args.rss_http_timeout) - if not data: + cache = rss2irc.read_cache(logger, args.cache) + source = cache.get_source_by_url(args.rss_url) + + rsp = rss2irc.get_rss( + logger, + args.rss_url, + args.rss_http_timeout, + source.make_caching_headers(), + ) + if rsp.status_code == 304: + logger.debug("No new RSS data since the last run") + rss2irc.write_cache(cache, args.cache) + sys.exit(0) + + if not rsp.text: logger.error("Failed to get RSS from %s", args.rss_url) sys.exit(1) - news = rss2irc.parse_news(data) + news = rss2irc.parse_news(rsp.text) if not news: logger.info("No news?") sys.exit(0) - cache = rss2irc.read_cache(logger, args.cache) - rss2irc.scrub_cache(logger, cache) - - for key in list(news.keys()): - if key in cache.items: - logger.debug("Key %s found in cache", key) - cache.items[key] = int(time.time()) + args.cache_expiration - news.pop(key) + source.extract_caching_headers(rsp.headers) + rss2irc.scrub_items(logger, cache) + rss2irc.prune_news(logger, cache, news, args.cache_expiration) slack_client = get_slack_web_client( slack_token, @@ -117,10 +125,8 @@ def main(): finally: time.sleep(args.sleep) - expiration = int(time.time()) + args.cache_expiration - for key in list(news.keys()): - cache.items[key] = expiration - + rss2irc.update_items_expiration(cache, news, args.cache_expiration) + cache.scrub_data_sources() 
rss2irc.write_cache(cache, args.cache) # TODO(zstyblik): remove error file except Exception: @@ -146,7 +152,7 @@ def parse_args() -> argparse.Namespace: "--cache-expiration", dest="cache_expiration", type=int, - default=rss2irc.EXPIRATION, + default=rss2irc.CACHE_EXPIRATION, help="Time, in seconds, for how long to keep items in cache.", ) parser.add_argument( diff --git a/tests/test_gh2slack.py b/tests/test_gh2slack.py index 4bfa1bd..ce27125 100644 --- a/tests/test_gh2slack.py +++ b/tests/test_gh2slack.py @@ -388,8 +388,8 @@ def test_process_page_items(): assert to_publish == expected_to_publish -def test_scrub_cache(): - """Test scrub_cache().""" +def test_scrub_items(): + """Test scrub_items().""" item_expiration = int(time.time()) + 60 test_cache = rss2irc.CachedData( items={ @@ -411,6 +411,6 @@ def test_scrub_cache(): } logger = logging.getLogger("test") - gh2slack.scrub_cache(logger, test_cache) + gh2slack.scrub_items(logger, test_cache) assert test_cache.items == expected diff --git a/tests/test_phpbb2slack.py b/tests/test_phpbb2slack.py index 1970b64..e31883c 100644 --- a/tests/test_phpbb2slack.py +++ b/tests/test_phpbb2slack.py @@ -117,22 +117,6 @@ def test_main_ideal( "https://phpbb.example.com/threads/something-of-something.424837/", ] expected_slack_channel = "test" - - # Mock/set SLACK_TOKEN - monkeypatch.setenv("SLACK_TOKEN", "test") - # Mock HTTP RSS - rss_fname = os.path.join(SCRIPT_PATH, "files", "phpbb-rss.xml") - with open(rss_fname, "rb") as fhandle: - rss_data = fhandle.read().decode("utf-8") - - mock_http_rss = fixture_mock_requests.get(rss_url, text=rss_data) - # Mock Slack HTTP request - fixture_http_server.serve_content( - '{"ok": "true", "error": ""}', - 200, - {"Content-Type": "application/json"}, - ) - fixture_http_server.capture_requests = True expected_slack_requests = [ { "blocks": [ @@ -151,8 +135,38 @@ def test_main_ideal( "channel": expected_slack_channel, } ] + + # Mock/set SLACK_TOKEN + monkeypatch.setenv("SLACK_TOKEN", "test") + # Mock HTTP RSS + rss_fname = os.path.join(SCRIPT_PATH, "files", "phpbb-rss.xml") + with open(rss_fname, "rb") as fhandle: + rss_data = fhandle.read().decode("utf-8") + + mock_http_rss = fixture_mock_requests.get( + rss_url, + text=rss_data, + headers={"ETag": "pytest_etag", "Last-Modified": "pytest_lm"}, + ) + # Mock Slack HTTP request + fixture_http_server.serve_content( + '{"ok": "true", "error": ""}', + 200, + {"Content-Type": "application/json"}, + ) + fixture_http_server.capture_requests = True + + cache = rss2irc.CachedData() + source1 = cache.get_source_by_url(rss_url) + source1.http_etag = "" + source1.http_last_modified = "" + source1.last_used_ts = int(time.time()) - 2 * 86400 + source2 = cache.get_source_by_url("http://delete.example.com") + source2.last_used_ts = int(time.time()) - 2 * rss2irc.DATA_SOURCE_EXPIRATION + rss2irc.write_cache(cache, fixture_cache_file) # authors_file = os.path.join(SCRIPT_PATH, "files", "authors.txt") + logger = logging.getLogger("test") exception = None args = [ "./phpbb2slack.py", @@ -196,10 +210,16 @@ def test_main_ideal( assert exception.code == 0 assert out.getvalue().strip() == "" # Check cache and keys in it - logger = logging.getLogger("test") cache = rss2irc.read_cache(logger, fixture_cache_file) print("Cache: {}".format(cache)) assert list(cache.items.keys()) == expected_cache_keys + assert rss_url in cache.data_sources.keys() + source = cache.get_source_by_url(rss_url) + assert source.url == rss_url + assert source.http_etag == "pytest_etag" + assert source.http_last_modified 
== "pytest_lm" + assert source.last_used_ts > int(time.time()) - 60 + assert "http://delete.example.com" not in cache.data_sources # Check HTTP RSS mock assert mock_http_rss.called is True assert mock_http_rss.call_count == 1 @@ -214,6 +234,103 @@ def test_main_ideal( assert data == expected_slack_requests[0] +def test_main_cache_hit( + monkeypatch, fixture_mock_requests, fixture_cache_file, fixture_http_server +): + """Test that HTTP Status Code 304 is handled as expected.""" + handle = "test" + http_timeout = "10" + rss_url = "http://rss.example.com" + expected_cache_keys = [] + expected_slack_channel = "test" + + # Mock/set SLACK_TOKEN + monkeypatch.setenv("SLACK_TOKEN", "test") + # Mock HTTP RSS + mock_http_rss = fixture_mock_requests.get( + rss_url, + status_code=304, + text="", + headers={"ETag": "pytest_etag", "Last-Modified": "pytest_lm"}, + ) + # Mock Slack HTTP request + fixture_http_server.serve_content( + "Should not be called", + 500, + {"Content-Type": "application/json"}, + ) + fixture_http_server.capture_requests = True + + cache = rss2irc.CachedData() + source1 = cache.get_source_by_url(rss_url) + source1.http_etag = "pytest_etag" + source1.http_last_modified = "pytest_lm" + source1.last_used_ts = int(time.time()) - 2 * 86400 + rss2irc.write_cache(cache, fixture_cache_file) + # + authors_file = os.path.join(SCRIPT_PATH, "files", "authors.txt") + logger = logging.getLogger("test") + exception = None + args = [ + "./phpbb2slack.py", + "--authors-of-interest", + authors_file, + "--cache", + fixture_cache_file, + "--handle", + handle, + "--rss-url", + rss_url, + "--rss-http-timeout", + http_timeout, + "--slack-base-url", + fixture_http_server.url, + "--slack-channel", + expected_slack_channel, + "--slack-timeout", + "10", + "-v", + ] + + print("RSS URL: {:s}".format(rss_url)) + print("Slack URL: {:s}".format(fixture_http_server.url)) + print("Handle: {:s}".format(handle)) + print("Cache file: {:s}".format(fixture_cache_file)) + + saved_stdout = sys.stdout + out = io.StringIO() + sys.stdout = out + + with patch.object(sys, "argv", args): + try: + phpbb2slack.main() + except SystemExit as sys_exit: + exception = sys_exit + finally: + sys.stdout = saved_stdout + + assert isinstance(exception, SystemExit) is True + assert exception.code == 0 + assert out.getvalue().strip() == "" + # Check cache and keys in it + cache = rss2irc.read_cache(logger, fixture_cache_file) + print("Cache: {}".format(cache)) + assert list(cache.items.keys()) == expected_cache_keys + assert rss_url in cache.data_sources.keys() + source = cache.get_source_by_url(rss_url) + assert source.url == rss_url + assert source.http_etag == "pytest_etag" + assert source.http_last_modified == "pytest_lm" + assert source.last_used_ts > int(time.time()) - 60 + assert "http://delete.example.com" not in cache.data_sources + # Check HTTP RSS mock + assert mock_http_rss.called is True + assert mock_http_rss.call_count == 1 + assert mock_http_rss.last_request.text is None + # Check HTTP Slack + assert len(fixture_http_server.requests) == 0 + + @pytest.mark.parametrize( "cache,expected_cache", [ @@ -239,11 +356,11 @@ def test_main_ideal( ) ], ) -def test_scrub_cache(cache, expected_cache): - """Test scrub_cache().""" +def test_scrub_items(cache, expected_cache): + """Test scrub_items().""" logger = logging.getLogger() logger.disabled = True - phpbb2slack.scrub_cache(logger, cache) + phpbb2slack.scrub_items(logger, cache) assert cache.items == expected_cache @@ -269,19 +386,23 @@ def test_scrub_cache(cache, expected_cache): ), { 
"http://example.com": { - "expiration": get_item_expiration() + 60, + "expiration": 1717576487 + 60, "comments_cnt": 2, }, "http://www.example.com": { - "expiration": get_item_expiration() + 60, + "expiration": 1717576487 + 60, "comments_cnt": 20, }, }, - get_item_expiration() + 60, + 60, ) ], ) -def test_update_cache(news, cache, expected_cache, item_expiration): - """Test update_cache().""" - phpbb2slack.update_cache(cache, news, item_expiration) +@patch("phpbb2slack.time.time") +def test_update_items_expiration( + mock_time, news, cache, expected_cache, item_expiration +): + """Test update_items_expiration().""" + mock_time.return_value = 1717576487 + phpbb2slack.update_items_expiration(cache, news, item_expiration) assert cache.items == expected_cache diff --git a/tests/test_rss2irc.py b/tests/test_rss2irc.py index b676ad3..34c99df 100644 --- a/tests/test_rss2irc.py +++ b/tests/test_rss2irc.py @@ -14,6 +14,119 @@ SCRIPT_PATH = os.path.dirname(os.path.realpath(__file__)) +@pytest.mark.parametrize( + "source,input_data,expected", + [ + # No attrs should bet set + ( + rss2irc.HTTPSource(), + {}, + {"etag": "", "last_modified": ""}, + ), + # Reset aatrs + ( + rss2irc.HTTPSource( + http_etag="et_test", http_last_modified="lm_test" + ), + {"header1": "firt", "header2": "second"}, + {"etag": "", "last_modified": ""}, + ), + # Set attrs + ( + rss2irc.HTTPSource( + http_etag="et_test", http_last_modified="lm_test" + ), + {"ETag": "test123", "Last-Modified": "abc123", "some": "header"}, + {"etag": "test123", "last_modified": "abc123"}, + ), + ], +) +def test_http_source_extract_caching_headers(source, input_data, expected): + """Test that HTTPSource.extract_caching_headers() works as expected.""" + source.extract_caching_headers(input_data) + assert source.http_etag == expected["etag"] + assert source.http_last_modified == expected["last_modified"] + + +@pytest.mark.parametrize( + "source,expected", + [ + ( + rss2irc.HTTPSource(), + {}, + ), + ( + rss2irc.HTTPSource(http_etag="et_test"), + {"if-none-match": "et_test"}, + ), + ( + rss2irc.HTTPSource(http_last_modified="lm_test"), + {"if-modified-since": "lm_test"}, + ), + ( + rss2irc.HTTPSource( + http_etag="et_test", http_last_modified="lm_test" + ), + {"if-modified-since": "lm_test", "if-none-match": "et_test"}, + ), + ], +) +def test_http_source_make_caching_headers(source, expected): + """Test that HTTPSource.make_caching_headers() works as expected.""" + result = source.make_caching_headers() + assert result == expected + + +@patch("rss2irc.time.time") +def test_cache_get_source_by_url(mock_time): + """Test that CachedData.get_source_by_url() sets last_used_ts attr.""" + mock_time.return_value = 1717428213 + url = "http://example.com" + source = rss2irc.HTTPSource( + last_used_ts=0, + url=url, + ) + cache = rss2irc.CachedData( + data_sources={ + url: source, + } + ) + result = cache.get_source_by_url(url) + assert result == source + assert result.last_used_ts == 1717428213 + + +def test_cache_scrub_data_sources_empty(cache): + """Test that CachedData.scrub_data_sources() when there are no sources.""" + cache = rss2irc.CachedData() + assert not cache.data_sources + cache.scrub_data_sources() + assert not cache.data_sources + + +def test_cache_scrub_data_sources(cache): + """Test that CachedData.scrub_data_sources() expired source is removed.""" + source1_url = "http://ww1.example.com" + source2_url = "http://ww2.example.com" + cache = rss2irc.CachedData() + source1 = cache.get_source_by_url(source1_url) + assert source1.last_used_ts > 0 + 
source1.last_used_ts = int(time.time()) - 2 * rss2irc.DATA_SOURCE_EXPIRATION + + source2 = cache.get_source_by_url(source2_url) + assert source2.last_used_ts > 0 + + assert "http://ww1.example.com" in cache.data_sources + assert source1.url == source1_url + assert "http://ww2.example.com" in cache.data_sources + assert source2.url == source2_url + + cache.scrub_data_sources() + + assert "http://ww1.example.com" not in cache.data_sources + assert "http://ww2.example.com" in cache.data_sources + + @pytest.mark.parametrize( "url,msg_attrs,handle,expected", [ @@ -60,15 +173,113 @@ def test_main_ideal( ), ] - logger = logging.getLogger("test") + mock_s_isfifo.return_value = True + rss_url = fixture_http_server.url rss_fname = os.path.join(SCRIPT_PATH, "files", "rss.xml") with open(rss_fname, "rb") as fhandle: - fixture_http_server.serve_content(fhandle.read().decode("utf-8"), 200) + fixture_http_server.serve_content( + fhandle.read().decode("utf-8"), + 200, + {"ETag": "pytest_etag", "Last-Modified": "pytest_lm"}, + ) + + cache = rss2irc.CachedData() + source1 = cache.get_source_by_url(rss_url) + source1.http_etag = "" + source1.http_last_modified = "" + source1.last_used_ts = int(time.time()) - 2 * 86400 + source2 = cache.get_source_by_url("http://delete.example.com") + source2.last_used_ts = int(time.time()) - 2 * rss2irc.DATA_SOURCE_EXPIRATION + rss2irc.write_cache(cache, fixture_cache_file) + + logger = logging.getLogger("test") + exception = None + args = [ + "./rss2irc.py", + "--rss-url", + rss_url, + "--rss-http-timeout", + http_timeout, + "--handle", + handle, + "--cache", + fixture_cache_file, + "--output", + fixture_output_file, + ] + + print("URL: {:s}".format(rss_url)) + print("Handle: {:s}".format(handle)) + print("Cache file: {:s}".format(fixture_cache_file)) + print("Output file: {:s}".format(fixture_output_file)) + + saved_stdout = sys.stdout + out = io.StringIO() + sys.stdout = out + + with patch.object(sys, "argv", args): + try: + rss2irc.main() + except SystemExit as sys_exit: + exception = sys_exit + finally: + sys.stdout = saved_stdout + + with open(fixture_output_file, "rb") as fhandle: + output = fhandle.readlines() + + assert isinstance(exception, SystemExit) is True + assert exception.code == 0 + assert out.getvalue().strip() == "" + assert mock_s_isfifo.called is True + # Check cache - keys in it and sources + cache = rss2irc.read_cache(logger, fixture_cache_file) + print("Cache: {}".format(cache)) + assert list(cache.items.keys()) == expected_cache_keys + assert rss_url in cache.data_sources.keys() + source = cache.get_source_by_url(rss_url) + assert source.url == rss_url + assert source.http_etag == "pytest_etag" + assert source.http_last_modified == "pytest_lm" + assert source.last_used_ts > int(time.time()) - 60 + assert "http://delete.example.com" not in cache.data_sources + # check output file + assert sorted(output) == sorted(expected_output) + + +@patch("rss2irc.stat.S_ISFIFO") +def test_main_cache_hit( + mock_s_isfifo, + fixture_mock_requests, + fixture_cache_file, + fixture_output_file, +): + """Test that HTTP Status Code 304 is handled as expected.""" + handle = "test" + http_timeout = "10" + rss_url = "http://rss.example.com" + expected_cache_keys = [] + expected_output = [] mock_s_isfifo.return_value = True + mock_http_rss = fixture_mock_requests.get( + rss_url, + status_code=304, + text="", + headers={ + "ETag": "pytest_etag", + "Last-Modified": "pytest_last_modified", + }, + ) - rss_url = fixture_http_server.url + cache = rss2irc.CachedData() + source1 = 
cache.get_source_by_url(rss_url) + source1.http_etag = "pytest_etag" + source1.http_last_modified = "pytest_last_modified" + source1.last_used_ts = int(time.time()) - 2 * 86400 + rss2irc.write_cache(cache, fixture_cache_file) + logger = logging.getLogger("test") exception = None args = [ "./rss2irc.py", @@ -107,16 +318,27 @@ def test_main_ideal( assert isinstance(exception, SystemExit) is True assert exception.code == 0 assert out.getvalue().strip() == "" + assert mock_s_isfifo.called is False + # Check HTTP call + assert mock_http_rss.called is True + assert mock_http_rss.call_count == 1 + assert mock_http_rss.last_request.text is None # Check cache and keys in it cache = rss2irc.read_cache(logger, fixture_cache_file) print("Cache: {}".format(cache)) assert list(cache.items.keys()) == expected_cache_keys + assert rss_url in cache.data_sources.keys() + source = cache.get_source_by_url(rss_url) + assert source.url == rss_url + assert source.http_etag == "pytest_etag" + assert source.http_last_modified == "pytest_last_modified" + assert source.last_used_ts > int(time.time()) - 60 # check output file assert sorted(output) == sorted(expected_output) -def test_scrub_cache(): - """Test scrub_cache().""" +def test_scrub_items(): + """Test scrub_items().""" logging.basicConfig(level=logging.CRITICAL) logger = logging.getLogger() logger.disabled = True @@ -132,5 +354,5 @@ def test_scrub_cache(): expected = { "foo": item_expiration, } - rss2irc.scrub_cache(logger, test_cache) + rss2irc.scrub_items(logger, test_cache) assert test_cache.items == expected diff --git a/tests/test_rss2slack.py b/tests/test_rss2slack.py index 0917185..da3f146 100644 --- a/tests/test_rss2slack.py +++ b/tests/test_rss2slack.py @@ -5,6 +5,7 @@ import logging import os import sys +import time from unittest.mock import patch import pytest @@ -94,22 +95,6 @@ def test_main_ideal( "http://www.example.com/scan.php?page=news_item&px=item2", ] expected_slack_channel = "test" - - # Mock/set SLACK_TOKEN - monkeypatch.setenv("SLACK_TOKEN", "test") - # Mock HTTP RSS - rss_fname = os.path.join(SCRIPT_PATH, "files", "rss.xml") - with open(rss_fname, "rb") as fhandle: - rss_data = fhandle.read().decode("utf-8") - - mock_http_rss = fixture_mock_requests.get(rss_url, text=rss_data) - # Mock Slack HTTP request - fixture_http_server.serve_content( - '{"ok": "true", "error": ""}', - 200, - {"Content-Type": "application/json"}, - ) - fixture_http_server.capture_requests = True expected_slack_requests = [ { "blocks": [ @@ -142,6 +127,38 @@ def test_main_ideal( "channel": expected_slack_channel, }, ] + # Mock/set SLACK_TOKEN + monkeypatch.setenv("SLACK_TOKEN", "test") + # Mock HTTP RSS + rss_fname = os.path.join(SCRIPT_PATH, "files", "rss.xml") + with open(rss_fname, "rb") as fhandle: + rss_data = fhandle.read().decode("utf-8") + + mock_http_rss = fixture_mock_requests.get( + rss_url, + text=rss_data, + headers={ + "ETag": "pytest_etag", + "Last-Modified": "pytest_lm", + }, + ) + # Mock Slack HTTP request + fixture_http_server.serve_content( + '{"ok": "true", "error": ""}', + 200, + {"Content-Type": "application/json"}, + ) + fixture_http_server.capture_requests = True + + cache = rss2irc.CachedData() + source1 = cache.get_source_by_url(rss_url) + source1.http_etag = "" + source1.http_last_modified = "" + source1.last_used_ts = int(time.time()) - 2 * 86400 + source2 = cache.get_source_by_url("http://delete.example.com") + source2.last_used_ts = int(time.time()) - 2 * rss2irc.DATA_SOURCE_EXPIRATION + rss2irc.write_cache(cache, fixture_cache_file) + 
# exception = None args = [ @@ -188,6 +205,13 @@ def test_main_ideal( cache = rss2irc.read_cache(logger, fixture_cache_file) print("Cache: {}".format(cache)) assert list(cache.items.keys()) == expected_cache_keys + assert rss_url in cache.data_sources.keys() + source = cache.get_source_by_url(rss_url) + assert source.url == rss_url + assert source.http_etag == "pytest_etag" + assert source.http_last_modified == "pytest_lm" + assert source.last_used_ts > int(time.time()) - 60 + assert "http://delete.example.com" not in cache.data_sources # Check HTTP RSS mock assert mock_http_rss.called is True assert mock_http_rss.call_count == 1 @@ -205,3 +229,99 @@ def test_main_ideal( assert req1[0] == "POST" data = json.loads(req1[1]) assert data == expected_slack_requests[1] + + +def test_main_cache_hit( + monkeypatch, fixture_mock_requests, fixture_cache_file, fixture_http_server +): + """Test that HTTP Status Code 304 is handled as expected.""" + handle = "test" + http_timeout = "10" + rss_url = "http://rss.example.com" + expected_cache_keys = [] + expected_slack_channel = "test" + # Mock/set SLACK_TOKEN + monkeypatch.setenv("SLACK_TOKEN", "test") + # Mock HTTP RSS + mock_http_rss = fixture_mock_requests.get( + rss_url, + status_code=304, + text="", + headers={ + "ETag": "pytest_etag", + "Last-Modified": "pytest_lm", + }, + ) + # Mock Slack HTTP request + fixture_http_server.serve_content( + "Should not be called", + 500, + {"Content-Type": "application/json"}, + ) + fixture_http_server.capture_requests = True + + cache = rss2irc.CachedData() + source1 = cache.get_source_by_url(rss_url) + source1.http_etag = "pytest_etag" + source1.http_last_modified = "pytest_lm" + source1.last_used_ts = int(time.time()) - 2 * 86400 + rss2irc.write_cache(cache, fixture_cache_file) + # + exception = None + args = [ + "./rss2slack.py", + "--rss-url", + rss_url, + "--rss-http-timeout", + http_timeout, + "--handle", + handle, + "--cache", + fixture_cache_file, + "--slack-base-url", + fixture_http_server.url, + "--slack-channel", + expected_slack_channel, + "--slack-timeout", + "10", + "-v", + ] + + print("RSS URL: {:s}".format(rss_url)) + print("Slack URL: {:s}".format(fixture_http_server.url)) + print("Handle: {:s}".format(handle)) + print("Cache file: {:s}".format(fixture_cache_file)) + + saved_stdout = sys.stdout + out = io.StringIO() + sys.stdout = out + + with patch.object(sys, "argv", args): + try: + rss2slack.main() + except SystemExit as sys_exit: + exception = sys_exit + finally: + sys.stdout = saved_stdout + + assert isinstance(exception, SystemExit) is True + assert exception.code == 0 + assert out.getvalue().strip() == "" + # Check cache and keys in it + logger = logging.getLogger("test") + cache = rss2irc.read_cache(logger, fixture_cache_file) + print("Cache: {}".format(cache)) + assert list(cache.items.keys()) == expected_cache_keys + assert rss_url in cache.data_sources.keys() + source = cache.get_source_by_url(rss_url) + assert source.url == rss_url + assert source.http_etag == "pytest_etag" + assert source.http_last_modified == "pytest_lm" + assert source.last_used_ts > int(time.time()) - 60 + # Check HTTP RSS mock + assert mock_http_rss.called is True + assert mock_http_rss.call_count == 1 + assert mock_http_rss.last_request.text is None + # Check HTTP Slack + # Note: this is just a shallow check, but it's better than nothing. + assert len(fixture_http_server.requests) == 0
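
Usage sketch (illustrative, not part of the diff): the validator round-trip
provided by the new HTTPSource API. The URL and header values below are
placeholders, not data from any real feed:

    import rss2irc

    source = rss2irc.HTTPSource(url="https://example.com/feed.xml")
    # First run: nothing cached yet, so no conditional headers are sent.
    assert source.make_caching_headers() == {}
    # Pretend a 200 OK response carried these validators (illustrative values).
    source.extract_caching_headers(
        {"ETag": '"abc123"', "Last-Modified": "Wed, 05 Jun 2024 08:00:00 GMT"}
    )
    # Subsequent runs send the validators back as conditional request headers.
    assert source.make_caching_headers() == {
        "if-none-match": '"abc123"',
        "if-modified-since": "Wed, 05 Jun 2024 08:00:00 GMT",
    }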
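
And a sketch of the end-to-end conditional-GET flow that main() in each script
now follows, using only functions added or changed by this patch; the cache
path and feed URL are placeholders:

    import logging

    import rss2irc

    logger = logging.getLogger("example")
    cache_file = "/tmp/rss2irc.cache"  # placeholder path
    url = "https://example.com/feed.xml"  # placeholder feed URL

    cache = rss2irc.read_cache(logger, cache_file)
    source = cache.get_source_by_url(url)
    # Send If-None-Match/If-Modified-Since built from cached validators, if any.
    rsp = rss2irc.get_rss(
        logger, url, extra_headers=source.make_caching_headers()
    )
    if rsp.status_code == 304:
        # Feed unchanged since the last run; persist the refreshed last_used_ts.
        rss2irc.write_cache(cache, cache_file)
    else:
        # Remember the new validators and drop long-unused data sources.
        source.extract_caching_headers(rsp.headers)
        cache.scrub_data_sources()
        rss2irc.write_cache(cache, cache_file)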