Allow setting the WACZ output filename in the EXPORT_URI setting (#18)
Wesley van Lee committed Nov 1, 2024
1 parent e044c9b commit a570295
Showing 7 changed files with 78 additions and 28 deletions.
6 changes: 6 additions & 0 deletions docs/settings.md
@@ -7,9 +7,15 @@
### `SW_EXPORT_URI`

```python
# Either configure the directory where the output should be uploaded to
SW_EXPORT_URI = "s3://scrapy-webarchive/"
SW_EXPORT_URI = "s3://scrapy-webarchive/{spider}/"
SW_EXPORT_URI = "s3://scrapy-webarchive/{year}/{month}/{day}/{spider}/"

# OR add the file name for full control of the output
SW_EXPORT_URI = "s3://scrapy-webarchive/output.wacz"
SW_EXPORT_URI = "s3://scrapy-webarchive/{spider}/output-{timestamp}.wacz"
SW_EXPORT_URI = "s3://scrapy-webarchive/{year}/{month}/{day}/{spider}-{timestamp}.wacz"
```

This is the output path of the WACZ file. Several template variables can be used to generate the output path dynamically. If the URI ends in a filename, that exact name is used for the WACZ file; otherwise a filename is generated automatically.
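
For illustration, a minimal sketch of how such a template could be expanded, assuming `str.format`-style substitution (the spider name `quotes` and the timestamp format below are assumptions, not the extension's exact behavior):

```python
from datetime import datetime, timezone

# Hypothetical expansion of the documented template variables.
now = datetime.now(timezone.utc)
export_uri = "s3://scrapy-webarchive/{year}/{month}/{day}/{spider}-{timestamp}.wacz".format(
    spider="quotes",  # assumed spider name
    year=now.strftime("%Y"),
    month=now.strftime("%m"),
    day=now.strftime("%d"),
    timestamp=now.strftime("%Y%m%d%H%M%S"),  # assumed timestamp format
)
# e.g. "s3://scrapy-webarchive/2024/11/01/quotes-20241101083000.wacz"
```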
4 changes: 2 additions & 2 deletions pyproject.toml
@@ -57,14 +57,14 @@ exclude = [
"venv",
]

line-length = 119
line-length = 120
indent-width = 4

# Assume Python 3.8
target-version = "py38"

[tool.ruff.lint]
select = ["E4", "E7", "E9", "F", "I"]
select = ["E4", "E7", "E9", "E501", "F", "I"]
ignore = []
fixable = ["ALL"]
unfixable = []
58 changes: 41 additions & 17 deletions scrapy_webarchive/extensions.py
@@ -1,7 +1,9 @@
from __future__ import annotations

import os
from datetime import datetime
from io import BytesIO
from typing import Tuple, Union

from scrapy import Spider, signals
from scrapy.crawler import Crawler
@@ -39,6 +41,7 @@ def persist_file(
class WaczExporter:
"""WACZ exporter extension that writes spider requests/responses as WARC and later compiles them to a WACZ."""

wacz_fname = None
STORE_SCHEMES: Dict[str, Type[FilesStoreProtocol]] = {
"": FSFilesStore,
"file": FSFilesStore,
@@ -50,27 +53,47 @@ class WaczExporter:
def __init__(self, settings: Settings, crawler: Crawler) -> None:
self.settings = settings
self.stats = crawler.stats
self.crawler = crawler

if not self.settings["SW_EXPORT_URI"]:
raise NotConfigured

if "scrapy_webarchive.spidermiddlewares.WaczCrawlMiddleware" in settings.getlist('SPIDER_MIDDLEWARES'):
raise NotConfigured("You must disable the WaczCrawlMiddleware before you can use this extension.")
# Check configuration prerequisites
self._check_configuration_prerequisites()

if "scrapy_webarchive.downloadermiddlewares.WaczMiddleware" in settings.getlist('DOWNLOADER_MIDDLEWARES'):
raise NotConfigured("You must disable the WaczMiddleware before you can use this extension.")
# Get the store URI and configure the WACZ filename
store_uri, self.wacz_fname = self._retrieve_store_uri_and_wacz_fname()

self.store: FilesStoreProtocol = self._get_store(spider_name=crawler.spider.name)
# Initialize store and writer
self.store: FilesStoreProtocol = self._get_store(store_uri)
self.writer = WarcFileWriter(collection_name=crawler.spider.name)

def _get_store(self, spider_name: str) -> FilesStoreProtocol:
archive_uri_template = self.settings["SW_EXPORT_URI"]
uri = archive_uri_template.format(**{
"spider": spider_name,
def _check_configuration_prerequisites(self) -> None:
"""raises NotConfigured if essential settings or middleware configurations are incorrect."""

if not self.settings.get("SW_EXPORT_URI"):
raise NotConfigured("Missing SW_EXPORT_URI setting.")

forbidden_middleware = [
("scrapy_webarchive.spidermiddlewares.WaczCrawlMiddleware", "SPIDER_MIDDLEWARES"),
("scrapy_webarchive.downloadermiddlewares.WaczMiddleware", "DOWNLOADER_MIDDLEWARES"),
]
if any(middleware in self.settings.getlist(key) for middleware, key in forbidden_middleware):
raise NotConfigured("Disable WACZ middlewares in SPIDER_MIDDLEWARES and DOWNLOADER_MIDDLEWARES.")

def _retrieve_store_uri_and_wacz_fname(self) -> Tuple[str, Union[str, None]]:
"""Sets up the export URI based on configuration and spider context."""

export_uri = self.settings["SW_EXPORT_URI"].format(
spider=self.crawler.spider.name,
**get_archive_uri_template_dt_variables(),
})
store_cls = self.STORE_SCHEMES[get_scheme_from_uri(uri)]
return store_cls(uri)
)

if os.path.isdir(export_uri):
return export_uri, None
else:
return os.path.split(export_uri)

def _get_store(self, store_uri: str) -> FilesStoreProtocol:
store_cls = self.STORE_SCHEMES[get_scheme_from_uri(store_uri)]
return store_cls(store_uri)

@classmethod
def from_crawler(cls, crawler: Crawler) -> Self:
@@ -134,8 +157,9 @@ def response_received(self, response: Response, request: Request, spider: Spider

def spider_closed(self, spider: Spider) -> None:
wacz_creator = WaczFileCreator(
store=self.store,
warc_fname=self.writer.warc_fname,
store=self.store,
warc_fname=self.writer.warc_fname,
wacz_fname=self.wacz_fname,
collection_name=spider.name,
title=self.settings["SW_WACZ_TITLE"],
description=self.settings["SW_WACZ_DESCRIPTION"],
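To illustrate the split performed by the new `_retrieve_store_uri_and_wacz_fname`, here is a quick standalone sketch of how `os.path.split` behaves on the two documented URI styles:

```python
import os

# A URI that ends in a filename splits into (store URI, WACZ filename).
print(os.path.split("s3://scrapy-webarchive/quotes/output.wacz"))
# -> ('s3://scrapy-webarchive/quotes', 'output.wacz')

# A URI that ends in a slash yields an empty filename; downstream,
# WaczFileCreator then falls back to its generated default name
# (wacz_fname or self.get_wacz_fname()).
print(os.path.split("s3://scrapy-webarchive/quotes/"))
# -> ('s3://scrapy-webarchive/quotes', '')
```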
6 changes: 4 additions & 2 deletions scrapy_webarchive/spidermiddlewares.py
@@ -47,7 +47,7 @@ def spider_opened(self, spider: Spider) -> None:
process, and collects valid WACZ files for further use.
If only one WACZ URI is provided, it opens and assigns the file to `self.wacz` as a `WaczFile` instance.
If multiple URIs are provided, valid files are grouped and assigned to `self.wacz` as a `MultiWaczFile` instance.
If multiple URIs are provided, valid files are assigned to `self.wacz` as a `MultiWaczFile` instance.
"""

spider.logger.info(f"[WACZDownloader] Found {len(self.wacz_uris)} WACZ URI(s) to open")
@@ -62,7 +62,9 @@ def spider_opened(self, spider: Spider) -> None:
spider.logger.error(f"[WACZDownloader] Could not open WACZ {wacz_uri}")

if wacz_files:
spider.logger.info(f"[WACZDownloader] Continuing with {len(wacz_files)}/{len(self.wacz_uris)} valid WACZ files")
spider.logger.info(
f"[WACZDownloader] Continuing with {len(wacz_files)}/{len(self.wacz_uris)} valid WACZ files"
)
if len(wacz_files) == 1:
self.wacz = WaczFile(wacz_files[0])
else:
25 changes: 19 additions & 6 deletions scrapy_webarchive/wacz.py
@@ -41,13 +41,23 @@ class WaczFileCreator:
hash_type = "sha256"
datapackage_fname = "datapackage.json"

def __init__(self, store: 'FilesStoreProtocol', warc_fname: str, collection_name: str, title: str, description: str, cdxj_fname: str = "index.cdxj") -> None:
def __init__(
self,
store: 'FilesStoreProtocol',
warc_fname: str,
collection_name: str,
title: str,
description: str,
wacz_fname: Union[str, None],
cdxj_fname: str = "index.cdxj",
) -> None:
self.store = store
self.warc_fname = warc_fname
self.cdxj_fname = cdxj_fname
self.collection_name = collection_name
self._title = title
self._description = description
self.wacz_fname = wacz_fname or self.get_wacz_fname()

def create(self) -> None:
"""Create the WACZ file from the WARC and CDXJ index and save it in the configured store."""
@@ -63,7 +73,7 @@ def create(self) -> None:

# Save WACZ to the storage
zip_buffer.seek(0)
self.store.persist_file(path=self.get_wacz_fname(), buf=zip_buffer, info=None)
self.store.persist_file(path=self.wacz_fname, buf=zip_buffer, info=None)

def create_wacz_zip(self) -> io.BytesIO:
"""Create the WACZ zip file and return the in-memory buffer."""
@@ -160,12 +170,15 @@ def collect_resources(self, zip_file: zipfile.ZipFile) -> List[Dict[str, Any]]:
@property
def title(self):
return self._title or self.collection_name

@property
def description(self):
return self._description or "This is the web archive generated by a scrapy-webarchive extension for the " \
f"{self.collection_name} spider. It is mainly for scraping purposes as it does not contain " \
"any js/css data. Though it can be replayed as bare HTML if the site does not depend on JavaScript."
return (
self._description
or f"This is the web archive generated by a scrapy-webarchive extension for the {self.collection_name} "
"spider. It is mainly for scraping purposes as it does not contain any js/css data. Though it can be "
"replayed as bare HTML if the site does not depend on JavaScript."
)


class WaczFile:
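As a usage sketch of the new `wacz_fname` parameter: the dummy store below is a stand-in for the real `FilesStoreProtocol` implementations, and the WARC filename is made up for illustration.

```python
from scrapy_webarchive.wacz import WaczFileCreator

class DummyStore:
    # Stand-in store that just reports what would be persisted.
    def persist_file(self, path, buf, info):
        print(f"would persist {len(buf.getvalue())} bytes to {path}")

creator = WaczFileCreator(
    store=DummyStore(),
    warc_fname="quotes-20241101000000-00000-host.warc.gz",  # assumed name
    collection_name="quotes",
    title="",
    description="",
    wacz_fname=None,  # None -> falls back to self.get_wacz_fname()
)
# Passing e.g. wacz_fname="output.wacz" instead pins the output name,
# which is what a filename in SW_EXPORT_URI now does.
```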
1 change: 1 addition & 0 deletions tests/test_wacz.py
@@ -30,6 +30,7 @@ def wacz_file_creator(self):
cdxj_fname=self.cdxj_fname,
title="Testing",
description="WACZ generated durning a unit-test",
wacz_fname=None,
)

@freeze_time("2024-10-04 08:27:11")
6 changes: 5 additions & 1 deletion tests/test_warc.py
@@ -24,7 +24,11 @@ def test_generate_warc_fname(monkeypatch):

@pytest.fixture
def warc_record_response():
payload = b"""HTTP/1.0 200\r\nContent-Length: 11064\r\nDate: Mon, 07 Oct 2024 09:58:44 GMT\r\nContent-Type: text/html; charset=utf-8\r\nStrict-Transport-Security: max-age=0; includeSubDomains; preload\r\n\r\n<!DOCTYPE html>\n<html lang="en">Welcome to scrapy-webarchive!</html>"""
payload = (
b"HTTP/1.0 200\r\nContent-Length: 11064\r\nDate: Mon, 07 Oct 2024 09:58:44 GMT\r\nContent-Type: text/html; "
b"charset=utf-8\r\nStrict-Transport-Security: max-age=0; includeSubDomains; preload\r\n\r\n<!DOCTYPE html>\n"
b"<html lang=\"en\">Welcome to scrapy-webarchive!</html>"
)
return WARCRecord(payload=payload, headers={"WARC-Target-URI": "http://example.com"})


