docs: describe usage for different middlewares and extension
settings: add prefix for namespaced settings
Wesley van Lee committed Oct 14, 2024
1 parent 8af1209 commit 8a58867
Showing 9 changed files with 104 additions and 31 deletions.
3 changes: 3 additions & 0 deletions docs/index.md
@@ -7,4 +7,7 @@
- Crawl against WACZ format archives.
- Integrate seamlessly with Scrapy’s spider request and response cycle.

## Limitations
- WACZ supports saving images, but this module does not yet integrate with Scrapy's image/file pipeline for retrieving images/files from the WACZ. Future support for this feature is planned.

**Source Code**: <a href="https://github.com/q-m/scrapy-webarchive" target="_blank">https://github.com/q-m/scrapy-webarchive</a>
28 changes: 18 additions & 10 deletions docs/settings.md
@@ -1,37 +1,45 @@
# Settings

`scrapy-webarchive` makes use of the following settings, in addition to Scrapy's settings:
`scrapy-webarchive` makes use of the following settings, in addition to Scrapy's settings. Note that all the settings are prefixed with `SW_`.

## Extensions

### `ARCHIVE_EXPORT_URI`
### `SW_EXPORT_URI`

```python
ARCHIVE_EXPORT_URI = "s3://scrapy-webarchive/"
ARCHIVE_EXPORT_URI = "s3://scrapy-webarchive/{year}/{month}/{day}/"
SW_EXPORT_URI = "s3://scrapy-webarchive/"
SW_EXPORT_URI = "s3://scrapy-webarchive/{year}/{month}/{day}/"
```

This is the output path of the WACZ file. Template variables can be included to generate the output path dynamically.

Supported variables: `year`, `month`, `day` and `timestamp`.
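
The extension fills these in via plain `str.format` substitution (see `extensions.py` in this commit). A minimal sketch of how a template might resolve at export time; the concrete date values are illustrative, not produced by the library:

```python
# Illustrative only: the resolved values below are made-up examples.
template = "s3://scrapy-webarchive/{year}/{month}/{day}/"
uri = template.format(year="2024", month="10", day="14")
assert uri == "s3://scrapy-webarchive/2024/10/14/"
```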

## Downloader middleware
## Downloader middleware and spider middleware

### `WACZ_SOURCE_URL`
### `SW_WACZ_SOURCE_URL`

```python
WACZ_SOURCE_URL = "s3://scrapy-webarchive/archive.wacz"
SW_WACZ_SOURCE_URL = "s3://scrapy-webarchive/archive.wacz"

# Allows multiple sources, comma separated.
WACZ_SOURCE_URL = "s3://scrapy-webarchive/archive.wacz,/path/to/archive.wacz"
SW_WACZ_SOURCE_URL = "s3://scrapy-webarchive/archive.wacz,/path/to/archive.wacz"
```

This setting defines the location of the WACZ file that should be used as a source for the crawl job.
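
The middleware splits this value on commas (see `downloadermiddlewares.py` in this commit), so a multi-source value resolves to a list of archives. A small sketch of that behaviour:

```python
import re

# Mirrors the split used by the middleware: re.split(r"\s*,\s*", wacz_url)
sources = "s3://scrapy-webarchive/archive.wacz, /path/to/archive.wacz"
wacz_urls = re.split(r"\s*,\s*", sources)
assert wacz_urls == ["s3://scrapy-webarchive/archive.wacz", "/path/to/archive.wacz"]
```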

### `WACZ_CRAWL`
### `SW_WACZ_CRAWL`

```python
WACZ_CRAWL = True
SW_WACZ_CRAWL = True
```

When enabled, the spider's original `start_requests` are ignored and all responses found in the WACZ archive are yielded instead.

### `SW_WACZ_TIMEOUT`

```python
SW_WACZ_TIMEOUT = 60
```

Transport timeout (in seconds) used when retrieving the WACZ file defined in `SW_WACZ_SOURCE_URL` from its location.
62 changes: 62 additions & 0 deletions docs/usage.md
@@ -0,0 +1,62 @@
# Usage

## Exporting

### Exporting a WACZ archive

To archive the requests/responses during a crawl job, you need to enable the `WaczExporter` extension.

```python
EXTENSIONS = {
"scrapy_webarchive.extensions.WaczExporter": 543,
}
```

This extension also requires you to set the export location using the `SW_EXPORT_URI` setting.

```python
SW_EXPORT_URI = "s3://scrapy-webarchive/"
```

Running a crawl job using these settings will result in a newly created WACZ file.
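
Putting the two settings together, a minimal sketch of the relevant `settings.py` entries (the export path is illustrative):

```python
# settings.py -- minimal sketch for exporting a WACZ archive during a crawl.
EXTENSIONS = {
    "scrapy_webarchive.extensions.WaczExporter": 543,
}

# Output location; template variables such as {year}/{month}/{day} are optional.
SW_EXPORT_URI = "s3://scrapy-webarchive/{year}/{month}/{day}/"
```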

## Crawling

There are two ways to crawl against a WACZ archive. Choose the strategy you want to use for your crawl job and follow the instructions below. The two strategies cannot be combined in a single crawl job.

### Lookup in a WACZ archive

One way to crawl against a WACZ archive is to use the `WaczMiddleware` downloader middleware. Instead of fetching the live resource, the middleware retrieves it from the archive and recreates a response using the data from the archive.

To use the downloader middleware, enable it in the settings like so:

```python
DOWNLOADER_MIDDLEWARES = {
"scrapy_webarchive.downloadermiddlewares.WaczMiddleware": 543,
}
```

Then define the location of the WACZ archive with the `SW_WACZ_SOURCE_URL` setting:

```python
SW_WACZ_SOURCE_URL = "s3://scrapy-webarchive/archive.wacz"
```
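
The spider itself needs no archive-specific changes for this strategy: with the middleware enabled, responses are reconstructed from the WACZ instead of fetched live. A minimal sketch (spider name, start URL, and selector are illustrative assumptions):

```python
import scrapy


class QuotesSpider(scrapy.Spider):
    # Illustrative spider; WaczMiddleware serves the responses from the archive.
    name = "quotes"
    start_urls = ["https://quotes.toscrape.com"]

    def parse(self, response):
        # Parse the archived page exactly as if it came from the live site.
        for text in response.css("div.quote span.text::text").getall():
            yield {"text": text}
```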

### Iterating a WACZ archive

Bypassing the spider's default behaviour, the `WaczCrawlMiddleware` spider middleware, when enabled, replaces the crawl with an iteration over all entries in the WACZ archive.

To use the spider middleware, enable it in the settings like so:

```python
SPIDER_MIDDLEWARES = {
"scrapy_webarchive.middleware.WaczCrawlMiddleware": 532,
}
```

Then define the location of the WACZ archive with the `SW_WACZ_SOURCE_URL` setting and enable WACZ crawling:

```python
SW_WACZ_SOURCE_URL = "s3://scrapy-webarchive/archive.wacz"
SW_WACZ_CRAWL = True
```
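
When iterating an archive, entries can also be filtered from the spider, as the middleware tests in this commit show: `allowed_domains` restricts entries by domain and an `archive_regex` attribute restricts them by URL pattern. A sketch under those assumptions (spider name and pattern are illustrative):

```python
import scrapy


class ArchiveQuotesSpider(scrapy.Spider):
    name = "archive_quotes"

    # Only WACZ entries for this domain are yielded by the spider middleware.
    allowed_domains = ["quotes.toscrape.com"]

    # Attribute read by WaczCrawlMiddleware (see tests/test_middleware.py):
    # only entries whose URL matches this pattern are yielded.
    archive_regex = r"https://quotes\.toscrape\.com/page/\d+/"

    def parse(self, response):
        yield {"url": response.url, "status": response.status}
```
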
6 changes: 3 additions & 3 deletions scrapy_webarchive/downloadermiddlewares.py
@@ -25,14 +25,14 @@ class WaczMiddleware:

    def __init__(self, settings: Settings, stats: StatsCollector) -> None:
        self.stats = stats
        wacz_url = settings.get("WACZ_SOURCE_URL", None)
        wacz_url = settings.get("SW_WACZ_SOURCE_URL", None)

        if not wacz_url:
            raise NotConfigured

        self.wacz_urls = re.split(r"\s*,\s*", wacz_url)
        self.crawl = settings.get("WACZ_CRAWL", False)
        self.timeout = settings.getfloat("WACZ_TIMEOUT", 60)
        self.crawl = settings.get("SW_WACZ_CRAWL", False)
        self.timeout = settings.getfloat("SW_WACZ_TIMEOUT", 60)

    @classmethod
    def from_crawler(cls, crawler: Crawler) -> Self:
4 changes: 2 additions & 2 deletions scrapy_webarchive/extensions.py
@@ -31,14 +31,14 @@ def __init__(self, settings: Settings, crawler: Crawler) -> None:
        self.settings = settings
        self.stats = crawler.stats

        if not self.settings["ARCHIVE_EXPORT_URI"]:
        if not self.settings["SW_EXPORT_URI"]:
            raise NotConfigured

        self.store = self._get_store()
        self.writer = WarcFileWriter(collection_name=crawler.spider.name)

    def _get_store(self):
        archive_uri_template = self.settings["ARCHIVE_EXPORT_URI"]
        archive_uri_template = self.settings["SW_EXPORT_URI"]
        uri = archive_uri_template.format(**get_archive_uri_template_variables())

        if Path(uri).is_absolute():  # to support win32 paths like: C:\\some\dir
6 changes: 3 additions & 3 deletions scrapy_webarchive/middleware.py
@@ -16,14 +16,14 @@
class WaczCrawlMiddleware:
    def __init__(self, settings: Settings, stats: StatsCollector) -> None:
        self.stats = stats
        wacz_url = settings.get("WACZ_SOURCE_URL", None)
        wacz_url = settings.get("SW_WACZ_SOURCE_URL", None)

        if not wacz_url:
            raise NotConfigured

        self.wacz_urls = re.split(r"\s*,\s*", wacz_url)
        self.crawl = settings.get("WACZ_CRAWL", False)
        self.timeout = settings.getfloat("WACZ_TIMEOUT", 60)
        self.crawl = settings.get("SW_WACZ_CRAWL", False)
        self.timeout = settings.getfloat("SW_WACZ_TIMEOUT", 60)

    @classmethod
    def from_crawler(cls, crawler: Crawler) -> Self:
6 changes: 3 additions & 3 deletions tests/test_downloadermiddlewares.py
@@ -17,9 +17,9 @@ def setup_method(self):

    def _get_settings(self, **new_settings):
        settings = {
            "WACZ_SOURCE_URL": get_test_data_path("warc_1_1", "quotes.wacz.gz").as_uri(),
            "WACZ_CRAWL": False,
            "WACZ_TIMEOUT": 60,
            "SW_WACZ_SOURCE_URL": get_test_data_path("warc_1_1", "quotes.wacz.gz").as_uri(),
            "SW_WACZ_CRAWL": False,
            "SW_WACZ_TIMEOUT": 60,
        }
        settings.update(new_settings)
        return Settings(settings)
8 changes: 4 additions & 4 deletions tests/test_extensions.py
@@ -19,22 +19,22 @@ def test_archive_export_uri_invalid_raises_not_configured(self):
    @mock.patch('scrapy_webarchive.extensions.FTPFilesStore.__init__', return_value=None)
    @mock.patch('scrapy_webarchive.extensions.FSFilesStore.__init__', return_value=None)
    def test_get_store(self, *args):
        crawler = get_crawler(settings_dict={"ARCHIVE_EXPORT_URI": "/tmp/scrapy-webarchive/wacz/"})
        crawler = get_crawler(settings_dict={"SW_EXPORT_URI": "/tmp/scrapy-webarchive/wacz/"})
        crawler.spider = crawler._create_spider("quotes")
        extension = WaczExporter.from_crawler(crawler)
        assert isinstance(extension.store, FSFilesStore)

        crawler = get_crawler(settings_dict={"ARCHIVE_EXPORT_URI": "s3://scrapy-webarchive/wacz/"})
        crawler = get_crawler(settings_dict={"SW_EXPORT_URI": "s3://scrapy-webarchive/wacz/"})
        crawler.spider = crawler._create_spider("quotes")
        extension = WaczExporter.from_crawler(crawler)
        assert isinstance(extension.store, S3FilesStore)

        crawler = get_crawler(settings_dict={"ARCHIVE_EXPORT_URI": "gs://scrapy-webarchive/wacz/"})
        crawler = get_crawler(settings_dict={"SW_EXPORT_URI": "gs://scrapy-webarchive/wacz/"})
        crawler.spider = crawler._create_spider("quotes")
        extension = WaczExporter.from_crawler(crawler)
        assert isinstance(extension.store, GCSFilesStore)

        crawler = get_crawler(settings_dict={"ARCHIVE_EXPORT_URI": "ftp://scrapy-webarchive/wacz/"})
        crawler = get_crawler(settings_dict={"SW_EXPORT_URI": "ftp://scrapy-webarchive/wacz/"})
        crawler.spider = crawler._create_spider("quotes")
        extension = WaczExporter.from_crawler(crawler)
        assert isinstance(extension.store, FTPFilesStore)
12 changes: 6 additions & 6 deletions tests/test_middleware.py
@@ -16,8 +16,8 @@ def setup_method(self):

    def _get_settings(self, **new_settings):
        settings = {
            "WACZ_SOURCE_URL": get_test_data_path("warc_1_1", "quotes.wacz.gz").as_uri(),
            "WACZ_TIMEOUT": 60,
            "SW_WACZ_SOURCE_URL": get_test_data_path("warc_1_1", "quotes.wacz.gz").as_uri(),
            "SW_WACZ_TIMEOUT": 60,
        }
        settings.update(new_settings)
        return Settings(settings)
@@ -32,25 +32,25 @@ def _middleware(self, **new_settings):
    def test_wacz_archive_is_ignored_follow_original_behaviour(self):
        request = Request("https://quotes.toscrape.com")

        with self._middleware(WACZ_CRAWL=False) as mw:
        with self._middleware(SW_WACZ_CRAWL=False) as mw:
            out = list(mw.process_start_requests([request], self.spider))
            assert out == [request]

    def test_wacz_archive_iterates_all_records(self):
        with self._middleware(WACZ_CRAWL=True) as mw:
        with self._middleware(SW_WACZ_CRAWL=True) as mw:
            out = list(mw.process_start_requests([], self.spider))
            assert len(out) == 101

    def test_wacz_archive_filters_allowed_domains(self):
        setattr(self.spider, "allowed_domains", "quotes.toscrape.com")

        with self._middleware(WACZ_CRAWL=True) as mw:
        with self._middleware(SW_WACZ_CRAWL=True) as mw:
            out = list(mw.process_start_requests([], self.spider))
            assert len(out) == 61

    def test_wacz_archive_filters_archive_regex(self):
        setattr(self.spider, "archive_regex", r"https://quotes\.toscrape\.com/page/\d+/")

        with self._middleware(WACZ_CRAWL=True) as mw:
        with self._middleware(SW_WACZ_CRAWL=True) as mw:
            out = list(mw.process_start_requests([], self.spider))
            assert len(out) == 9
