Merge pull request #29 from zstyblik/refactoring_into_lib

Move CachedData and HTTPSource classes into separate files
zstyblik · Jul 4, 2024 · 085a31a · 085a31a
2 parents 4fbbce1 + 672cdda
commit 085a31a
Show file tree

Hide file tree

Showing 23 changed files with 373 additions and 265 deletions.
diff --git a/cache_stats.py b/cache_stats.py
@@ -10,6 +10,7 @@
 from dataclasses import dataclass
 
 import rss2irc
+from lib import CachedData
 
 BUCKET_COUNT = 10
 
@@ -24,7 +25,7 @@ class Bucket:
 
 
 def calc_distribution(
-    logger: logging.Logger, cache: rss2irc.CachedData, buckets
+    logger: logging.Logger, cache: CachedData, buckets
 ) -> int:
     """Calculate item distribution inside cache."""
     keys = list(buckets.keys())
@@ -70,7 +71,7 @@ def get_timestamp(data) -> int:
 
 
 def get_timestamp_minmax(
-    logger: logging.Logger, cache: rss2irc.CachedData
+    logger: logging.Logger, cache: CachedData
 ) -> (int, int, int):
     """Return timestamp min, max and no. of errors."""
     ts_min = 99999999999

diff --git a/ci/run-reorder-python-imports.sh b/ci/run-reorder-python-imports.sh
@@ -2,4 +2,4 @@
 set -e
 set -u
 
-reorder-python-imports `find . ! -path '*/\.*' -name '*.py'`
+reorder-python-imports --py311-plus `find . ! -path '*/\.*' -name '*.py'`
diff --git a/gh2slack.py b/gh2slack.py
@@ -17,7 +17,9 @@
 import requests
 
 import rss2irc  # noqa: I202
-import rss2slack
+import rss2slack  # noqa: I202
+from lib import CachedData  # noqa: I202
+from lib import config_options  # noqa: I202
 
 ALIASES = {
     "issues": "issue",
@@ -101,17 +103,21 @@ def gh_parse_next_page(link_header: str) -> str:
 
 
 def gh_request(
-    logger: logging.Logger, url: str, timeout: int = rss2irc.HTTP_TIMEOUT
+    logger: logging.Logger, url: str, timeout: int = config_options.HTTP_TIMEOUT
 ) -> List:
     """Return list of responses from GitHub.
 
     Makes request to GH, follows 'Link' header if present, and returns list
     responses.
     """
     logger.debug("Requesting %s", url)
+    user_agent = "gh2slack_{:d}".format(int(time.time()))
     rsp = requests.get(
         url,
-        headers={"Accept": "application/vnd.github.v3+json"},
+        headers={
+            "Accept": "application/vnd.github.v3+json",
+            "User-Agent": user_agent,
+        },
         params={"state": "open", "sort": "created"},
         timeout=timeout,
     )
@@ -223,7 +229,7 @@ def parse_args() -> argparse.Namespace:
         "--cache-expiration",
         dest="cache_expiration",
         type=int,
-        default=rss2irc.CACHE_EXPIRATION,
+        default=config_options.CACHE_EXPIRATION,
         help="Time, in seconds, for how long to keep items " "in cache.",
     )
     parser.add_argument(
@@ -275,9 +281,9 @@ def parse_args() -> argparse.Namespace:
         "--slack-timeout",
         dest="slack_timeout",
         type=int,
-        default=rss2irc.HTTP_TIMEOUT,
+        default=config_options.HTTP_TIMEOUT,
         help="Slack API Timeout. Defaults to {:d} seconds.".format(
-            rss2irc.HTTP_TIMEOUT
+            config_options.HTTP_TIMEOUT
         ),
     )
     parser.add_argument(
@@ -303,7 +309,7 @@ def parse_args() -> argparse.Namespace:
 
 def process_page_items(
     logger: logging.Logger,
-    cache: rss2irc.CachedData,
+    cache: CachedData,
     pages: List,
     expiration: int,
     repository_url: str,
@@ -347,7 +353,7 @@ def process_page_items(
     return to_publish
 
 
-def scrub_items(logger: logging.Logger, cache: rss2irc.CachedData) -> None:
+def scrub_items(logger: logging.Logger, cache: CachedData) -> None:
     """Scrub cache and remove expired items."""
     time_now = int(time.time())
     for key in list(cache.items.keys()):

diff --git a/git_commits2slack.py b/git_commits2slack.py
@@ -16,8 +16,8 @@
 from typing import Dict
 from typing import List
 
-import rss2irc
 import rss2slack
+from lib import config_options
 
 RE_GIT_AUTD = re.compile(r"^Already up-to-date.$")
 RE_GIT_UPDATING = re.compile(r"^Updating [a-z0-9]+", re.I)
@@ -254,9 +254,9 @@ def parse_args() -> argparse.Namespace:
         "--slack-timeout",
         dest="slack_timeout",
         type=int,
-        default=rss2irc.HTTP_TIMEOUT,
+        default=config_options.HTTP_TIMEOUT,
         help="Slack API Timeout. Defaults to {:d} seconds.".format(
-            rss2irc.HTTP_TIMEOUT
+            config_options.HTTP_TIMEOUT
         ),
     )
     parser.add_argument(

diff --git a/lib/__init__.py b/lib/__init__.py
@@ -0,0 +1,8 @@
+#!/usr/bin/env python3
+"""Just init.
+
+I love how black and reorder-python-imports play nicely together and no
+workarounds are needed whatsoever.
+"""
+from .cached_data import CachedData  # noqa: F401
+from .http_source import HTTPSource  # noqa: F401
diff --git a/lib/cached_data.py b/lib/cached_data.py
@@ -0,0 +1,45 @@
+#!/usr/bin/env python3
+"""Code related to Cache.
+
+I love how black and reorder-python-imports play nicely together and no
+workarounds are needed whatsoever.
+"""
+import time
+from dataclasses import dataclass
+from dataclasses import field
+
+from .config_options import DATA_SOURCE_EXPIRATION
+from .http_source import HTTPSource
+
+
+@dataclass
+class CachedData:
+    """CachedData represents locally cached data and state."""
+
+    data_sources: dict = field(default_factory=dict)
+    items: dict = field(default_factory=dict)
+
+    def get_source_by_url(self, url: str) -> HTTPSource:
+        """Return source by URL.
+
+        If source doesn't exist, it will be created.
+        """
+        source = self.data_sources.get(url, None)
+        if source:
+            source.last_used_ts = int(time.time())
+            return source
+
+        self.data_sources[url] = HTTPSource(
+            last_used_ts=int(time.time()), url=url
+        )
+        return self.get_source_by_url(url)
+
+    def scrub_data_sources(
+        self, expiration: int = DATA_SOURCE_EXPIRATION
+    ) -> None:
+        """Delete expired data sources."""
+        now = int(time.time())
+        for key in list(self.data_sources.keys()):
+            diff = now - self.data_sources[key].last_used_ts
+            if int(diff) > expiration:
+                self.data_sources.pop(key)
diff --git a/lib/config_options.py b/lib/config_options.py
@@ -0,0 +1,9 @@
+#!/usr/bin/env python3
+"""Common configuration options.
+
+I love how black and reorder-python-imports play nicely together and no
+workarounds are needed whatsoever.
+"""
+CACHE_EXPIRATION = 86400  # seconds
+DATA_SOURCE_EXPIRATION = 30 * 86400  # seconds
+HTTP_TIMEOUT = 30  # seconds
diff --git a/lib/http_source.py b/lib/http_source.py
@@ -0,0 +1,41 @@
+#!/usr/bin/env python3
+"""Code related to HTTP Source.
+
+I love how black and reorder-python-imports play nicely together and no
+workarounds are needed whatsoever.
+"""
+from dataclasses import dataclass
+from dataclasses import field
+from typing import Dict
+
+
+@dataclass
+class HTTPSource:
+    """Class represents HTTP data source."""
+
+    http_etag: str = field(default_factory=str)
+    http_last_modified: str = field(default_factory=str)
+    last_used_ts: int = 0
+    url: str = field(default_factory=str)
+
+    def extract_caching_headers(self, headers: Dict[str, str]) -> None:
+        """Extract cache related headers from given dict."""
+        self.http_etag = ""
+        self.http_last_modified = ""
+        for key, value in headers.items():
+            key = key.lower()
+            if key == "etag":
+                self.http_etag = value
+            elif key == "last-modified":
+                self.http_last_modified = value
+
+    def make_caching_headers(self) -> Dict[str, str]:
+        """Return cache related headers as a dict."""
+        headers = {}
+        if self.http_etag:
+            headers["if-none-match"] = self.http_etag
+
+        if self.http_last_modified:
+            headers["if-modified-since"] = self.http_last_modified
+
+        return headers
diff --git a/lib/tests/test_cached_data.py b/lib/tests/test_cached_data.py
@@ -0,0 +1,60 @@
+#!/usr/bin/env python3
+"""Unit tests for cached_data.py."""
+import time
+from unittest.mock import patch
+
+from lib import CachedData
+from lib import config_options
+from lib import HTTPSource  # noqa: I100
+
+
+@patch("lib.cached_data.time.time")
+def test_cache_get_source_by_url(mock_time):
+    """Test that CachedData.get_source_by_url() sets last_used_ts attr."""
+    mock_time.return_value = 1717428213
+    url = "http://example.com"
+    source = HTTPSource(
+        last_used_ts=0,
+        url=url,
+    )
+    cache = CachedData(
+        data_sources={
+            url: source,
+        }
+    )
+    result = cache.get_source_by_url(url)
+    assert result == source
+    assert result.last_used_ts == 1717428213
+
+
+def test_cache_scrub_data_sources_empty(cache):
+    """Test CachedData.scrub_data_sources() when there are no sources."""
+    cache = CachedData()
+    assert not cache.data_sources
+    cache.scrub_data_sources()
+    assert not cache.data_sources
+
+
+def test_cache_scrub_data_sources(cache):
+    """Test that CachedData.scrub_data_sources() removes expired source."""
+    source1_url = "http://ww1.example.com"
+    source2_url = "http://ww2.example.com"
+    cache = CachedData()
+    source1 = cache.get_source_by_url(source1_url)
+    assert source1.last_used_ts > 0
+    source1.last_used_ts = (
+        int(time.time()) - 2 * config_options.DATA_SOURCE_EXPIRATION
+    )
+
+    source2 = cache.get_source_by_url(source2_url)
+    assert source2.last_used_ts > 0
+
+    assert "http://ww1.example.com" in cache.data_sources
+    assert source1.url == source1_url
+    assert "http://ww2.example.com" in cache.data_sources
+    assert source2.url == source2_url
+
+    cache.scrub_data_sources()
+
+    assert "http://ww1.example.com" not in cache.data_sources
+    assert "http://ww2.example.com" in cache.data_sources
diff --git a/lib/tests/test_http_source.py b/lib/tests/test_http_source.py
@@ -0,0 +1,62 @@
+#!/usr/bin/env python3
+"""Unit tests for http_source.py."""
+import pytest
+
+from lib import HTTPSource  # noqa: I202
+
+
+@pytest.mark.parametrize(
+    "source,input_data,expected",
+    [
+        # No attrs should be set
+        (
+            HTTPSource(),
+            {},
+            {"etag": "", "last_modified": ""},
+        ),
+        # Reset attrs
+        (
+            HTTPSource(http_etag="et_test", http_last_modified="lm_test"),
+            {"header1": "firt", "header2": "second"},
+            {"etag": "", "last_modified": ""},
+        ),
+        # Set attrs
+        (
+            HTTPSource(http_etag="et_test", http_last_modified="lm_test"),
+            {"ETag": "test123", "Last-Modified": "abc123", "some": "header"},
+            {"etag": "test123", "last_modified": "abc123"},
+        ),
+    ],
+)
+def test_http_source_extract_caching_headers(source, input_data, expected):
+    """Test that HTTPSource.extract_caching_headers() works as expected."""
+    source.extract_caching_headers(input_data)
+    assert source.http_etag == expected["etag"]
+    assert source.http_last_modified == expected["last_modified"]
+
+
+@pytest.mark.parametrize(
+    "source,expected",
+    [
+        (
+            HTTPSource(),
+            {},
+        ),
+        (
+            HTTPSource(http_etag="et_test"),
+            {"if-none-match": "et_test"},
+        ),
+        (
+            HTTPSource(http_last_modified="lm_test"),
+            {"if-modified-since": "lm_test"},
+        ),
+        (
+            HTTPSource(http_etag="et_test", http_last_modified="lm_test"),
+            {"if-modified-since": "lm_test", "if-none-match": "et_test"},
+        ),
+    ],
+)
+def test_http_source_make_caching_headers(source, expected):
+    """Test that HTTPSource.make_caching_headers() works as expected."""
+    result = source.make_caching_headers()
+    assert result == expected
diff --git a/migrations/convert_cache_to_dataclass_v1.py b/migrations/convert_cache_to_dataclass_v1.py
@@ -15,13 +15,14 @@
 import sys
 from importlib.machinery import SourceFileLoader
 
-# NOTICE: An ugly hack in order to be able to import CachedData class from
-# rss2irc. I'm real sorry about this, son.
+# NOTICE: An ugly hack in order to be able to import CachedData class.
+# I'm real sorry about this, son.
 # NOTE: Sadly, importlib.util and spec didn't cut it. Also, I'm out of time on
 # this. Therefore, see you again in the future once this ceases to work.
 SCRIPT_PATH = os.path.dirname(os.path.realpath(__file__))
-rss2irc_module_path = os.path.join(SCRIPT_PATH, "..", "rss2irc.py")
-rss2irc = SourceFileLoader("rss2irc", rss2irc_module_path).load_module()
+lib_module_path = os.path.join(SCRIPT_PATH, "..", "lib", "__init__.py")
+lib = SourceFileLoader("lib", lib_module_path).load_module()
+CachedData = lib.cached_data.CachedData
 
 
 def main():
@@ -50,7 +51,7 @@ def main():
     logger.info("Create backup file '%s' from '%s'.", bak_file, args.cache)
     shutil.copy2(args.cache, bak_file)
 
-    new_cache = rss2irc.CachedData()
+    new_cache = CachedData()
     for key, value in cache.items():
         new_cache.items[key] = value
-Original file line number
+Diff line change
@@ Expand Up / @@ -2,4 +2,4 @@ @@
     set -e
     set -u
-    reorder-python-imports `find . ! -path '*/\.*' -name '*.py'`
+    reorder-python-imports --py311-plus `find . ! -path '*/\.*' -name '*.py'`