Update usage docs and synchronize spider and downloader middleware
Wesley van Lee committed Oct 18, 2024
1 parent bea52d1 commit 44889fe
Showing 6 changed files with 77 additions and 46 deletions.
13 changes: 9 additions & 4 deletions docs/usage.md
@@ -22,7 +22,7 @@ Running a crawl job using these settings will result in a newly created WACZ file

## Crawling

There are 2 ways to crawl against a WACZ archive. Choose a strategy that you want to use for your crawl job, and follow the instruction as described below. Using both strategies at the same time is not allowed.
There are 2 ways to crawl against a WACZ archive. Choose a strategy that you want to use for your crawl job, and follow the instructions described below.

### Lookup in a WACZ archive

@@ -40,17 +40,22 @@ Then define the location of the WACZ archive with `SW_WACZ_SOURCE_URI` setting:

```python
SW_WACZ_SOURCE_URI = "s3://scrapy-webarchive/archive.wacz"
SW_WACZ_CRAWL = True
```
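
With this lookup strategy the spider itself does not need to change: requests are resolved against the archive and the responses are rebuilt from it. Below is a minimal sketch of such a spider; the spider name, selectors, and archive URI are illustrative, and it assumes the `scrapy_webarchive.downloadermiddlewares.WaczMiddleware` downloader middleware is enabled for this strategy as this section describes.

```python
import scrapy


class QuotesSpider(scrapy.Spider):
    """Hypothetical spider; nothing archive-specific in the callbacks."""

    name = "quotes"
    allowed_domains = ["quotes.toscrape.com"]
    start_urls = ["https://quotes.toscrape.com/"]

    custom_settings = {
        # Point the lookup at the archive; the URI below is a placeholder.
        "SW_WACZ_SOURCE_URI": "s3://scrapy-webarchive/archive.wacz",
    }

    def parse(self, response):
        for text in response.css("div.quote span.text::text").getall():
            yield {"quote": text}
        # Pagination requests are resolved against the archive as well.
        yield from response.follow_all(css="li.next a", callback=self.parse)
```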

### Iterating a WACZ archive

Going around the default behaviour of the spider, the `WaczCrawlMiddleware` spider middleware will, when enabled, replace the crawl by an iteration through all the entries in the WACZ archive.
Bypassing the default behaviour of the spider, the `WaczCrawlMiddleware` spider middleware will, when enabled, replace the crawl with an iteration through all the entries in the WACZ archive index. Then, similar to the previous strategy, it will recreate a response using the data from the archive.

To use the spider middleware, enable it in the settings like so:
To use this strategy, enable both middlewares in the spider settings like so:

```python
DOWNLOADER_MIDDLEWARES = {
"scrapy_webarchive.downloadermiddlewares.WaczMiddleware": 543,
}

SPIDER_MIDDLEWARES = {
"scrapy_webarchive.middleware.WaczCrawlMiddleware": 532,
"scrapy_webarchive.spidermiddlewares.WaczCrawlMiddleware": 543,
}
```
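
Put together, the settings for this strategy might look like the sketch below; the archive URI is a placeholder and the priority values simply follow the example above. `SW_WACZ_CRAWL` needs to be enabled, since the spider middleware only iterates the index when it is set.

```python
# settings.py sketch for the iteration strategy; the archive URI is a
# placeholder and the priority values follow the example above.
SW_WACZ_SOURCE_URI = "s3://scrapy-webarchive/archive.wacz"
SW_WACZ_CRAWL = True  # let the spider middleware replace the start requests

DOWNLOADER_MIDDLEWARES = {
    "scrapy_webarchive.downloadermiddlewares.WaczMiddleware": 543,
}

SPIDER_MIDDLEWARES = {
    "scrapy_webarchive.spidermiddlewares.WaczCrawlMiddleware": 543,
}
```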

31 changes: 20 additions & 11 deletions scrapy_webarchive/downloadermiddlewares.py
@@ -1,10 +1,10 @@
import re

from scrapy.exceptions import IgnoreRequest
from scrapy.http.request import Request
from scrapy.http.response import Response
from scrapy.spiders import Spider

from scrapy_webarchive.exceptions import WaczMiddlewareException
from scrapy_webarchive.spidermiddlewares import BaseWaczMiddleware
from scrapy_webarchive.warc import record_transformer

@@ -18,21 +18,30 @@ class WaczMiddleware(BaseWaczMiddleware):
"""

def process_request(self, request: Request, spider: Spider):
if not hasattr(self, 'wacz'):
self.stats.set_value("webarchive/no_valid_sources", True, spider=spider)
raise IgnoreRequest()
# Continue default crawl behaviour
if not self.crawl:
return None

# ignore blacklisted pages (to avoid crawling e.g. redirects from whitelisted pages to unwanted ones)
if hasattr(spider, "archive_blacklist_regexp") and re.search(spider.archive_blacklist_regexp, request.url):
self.stats.inc_value("webarchive/request_blacklisted", spider=spider)
raise IgnoreRequest()
# If the attribute has not been set, none of the WACZ could be opened.
if self.crawl and not hasattr(self, 'wacz'):
raise WaczMiddlewareException("Could not open any WACZ files, check your WACZ URIs and authentication.")

# ignore when crawling and flag indicates this request needs to be skipped during wacz crawl
if self.crawl and "wacz_crawl_skip" in request.flags:
# Ignore when crawling and flag indicates this request needs to be skipped during WACZ crawl
if "wacz_crawl_skip" in request.flags:
self.stats.inc_value("webarchive/crawl_skip", spider=spider)
raise IgnoreRequest()

# get record from existing index entry, or else lookup by URL
# Filter out off-site requests
if self._is_off_site(request.url, spider):
self.stats.inc_value("webarchive/crawl_skip/off_site", spider=spider)
raise IgnoreRequest()

# Ignore disallowed pages (to avoid crawling e.g. redirects from whitelisted pages to unwanted ones)
if self._is_disallowed_by_spider(request.url, spider):
self.stats.inc_value("webarchive/crawl_skip/disallowed", spider=spider)
raise IgnoreRequest()

# Get record from existing index entry, or else lookup by URL
if request.meta.get("cdxj_record"):
warc_record = self.wacz.get_warc_from_cdxj_record(cdxj_record=request.meta["cdxj_record"])
else:
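
The off-site and disallow checks above are driven by optional spider attributes: `allowed_domains` (standard Scrapy) and `archive_disallow_regexp` introduced in this commit. Note that, as implemented in `_is_disallowed_by_spider`, a URL is skipped when it does not match the pattern, so the regexp effectively acts as an allow pattern. A sketch of a spider setting both follows; the spider name and pattern are illustrative.

```python
import scrapy


class ArchivedQuotesSpider(scrapy.Spider):
    """Hypothetical spider showing the attributes the middleware inspects."""

    name = "archived_quotes"

    # Requests whose hostname is not in this list are skipped and counted
    # under webarchive/crawl_skip/off_site.
    allowed_domains = ["quotes.toscrape.com"]

    # URLs that do NOT match this pattern are skipped and counted under
    # webarchive/crawl_skip/disallowed (see _is_disallowed_by_spider).
    archive_disallow_regexp = r"https://quotes\.toscrape\.com/page/\d+/"

    def parse(self, response):
        yield {"url": response.url}
```
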
69 changes: 43 additions & 26 deletions scrapy_webarchive/spidermiddlewares.py
@@ -9,6 +9,7 @@
from scrapy.statscollectors import StatsCollector
from typing_extensions import Iterable, Self

from scrapy_webarchive.exceptions import WaczMiddlewareException
from scrapy_webarchive.wacz import MultiWaczFile, WaczFile, open_wacz_file
from scrapy_webarchive.warc import record_transformer

@@ -67,6 +68,17 @@ def spider_opened(self, spider: Spider) -> None:
self.wacz = MultiWaczFile(wacz_files)


def _is_off_site(self, url: str, spider: Spider) -> bool:
"""Check if the URL is off-site based on allowed domains."""

return hasattr(spider, "allowed_domains") and urlparse(url).hostname not in spider.allowed_domains

def _is_disallowed_by_spider(self, url: str, spider: Spider) -> bool:
"""Check if the URL is disallowed by the spider's archive rules."""

return hasattr(spider, "archive_disallow_regexp") and not re.search(spider.archive_disallow_regexp, url)


class WaczCrawlMiddleware(BaseWaczMiddleware):
"""
Scrapy WACZ crawl spider middleware to crawl from a WACZ archive.
@@ -82,30 +94,35 @@ def spider_opened(self, spider: Spider) -> None:
super().spider_opened(spider)

def process_start_requests(self, start_requests: Iterable[Request], spider: Spider) -> Iterable[Request]:
if not self.crawl or not hasattr(self, 'wacz'):
for request in start_requests:
yield request

# Ignore original start requests, just yield all responses found
else:
for entry in self.wacz.iter_index():
url = entry.data["url"]

# filter out off-site responses
if hasattr(spider, "allowed_domains") and urlparse(url).hostname not in spider.allowed_domains:
continue

# only accept allowed responses if requested by spider
if hasattr(spider, "archive_regex") and not re.search(spider.archive_regex, url):
continue

"""Processes start requests and yields WACZ index entries or original requests based on the crawl setting."""

# If crawl is disabled, yield the original start requests.
if not self.crawl:
yield from start_requests
return
# If the attribute has not been set, none of the WACZ could be opened.
elif not hasattr(self, "wacz"):
raise WaczMiddlewareException("Could not open any WACZ files, check your WACZ URIs and authentication.")

# Iterate over entries in the WACZ index.
for entry in self.wacz.iter_index():
url = entry.data["url"]

# Filter out off-site requests
if self._is_off_site(url, spider):
self.stats.inc_value("webarchive/crawl_skip/off_site", spider=spider)
flags = ["wacz_start_request", "wacz_crawl_skip"]
# Ignore disallowed pages (to avoid crawling e.g. redirects from whitelisted pages to unwanted ones)
elif self._is_disallowed_by_spider(url, spider):
self.stats.inc_value("webarchive/crawl_skip/disallowed", spider=spider)
flags = ["wacz_start_request", "wacz_crawl_skip"]
else:
self.stats.inc_value("webarchive/start_request_count", spider=spider)

# do not filter to allow all occurrences to be handled
# since we don't yet get all information for the request, this can be necessary
yield record_transformer.request_for_record(
entry,
flags=["wacz_start_request"],
meta={"cdxj_record": entry},
dont_filter=True,
)
flags = ["wacz_start_request"]

yield record_transformer.request_for_record(
entry,
flags=flags,
meta={"cdxj_record": entry},
dont_filter=True,
)
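
With this change, index entries that fail the off-site or disallow checks are no longer silently dropped: they are still yielded, tagged with the `wacz_crawl_skip` flag, and the downloader middleware then ignores them while the skips are recorded in the crawl stats. Below is a sketch of reading those counters after a run, assuming a hypothetical spider and using Scrapy's standard stats API.

```python
import scrapy


class ArchiveStatsSpider(scrapy.Spider):
    """Hypothetical spider that logs the webarchive counters on close."""

    name = "archive_stats"

    def closed(self, reason):
        # Stat keys come from the middleware code in this commit.
        stats = self.crawler.stats
        self.logger.info(
            "start requests: %s, off-site skips: %s, disallowed skips: %s",
            stats.get_value("webarchive/start_request_count", 0),
            stats.get_value("webarchive/crawl_skip/off_site", 0),
            stats.get_value("webarchive/crawl_skip/disallowed", 0),
        )
```
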
2 changes: 1 addition & 1 deletion tests/test_downloadermiddlewares.py
@@ -23,7 +23,7 @@ def _get_wacz_source_url(self) -> str:
def _get_settings(self, **new_settings):
settings = {
"SW_WACZ_SOURCE_URI": self._get_wacz_source_url(),
"SW_WACZ_CRAWL": False,
"SW_WACZ_CRAWL": True,
"SW_WACZ_TIMEOUT": 60,
}
settings.update(new_settings)
2 changes: 1 addition & 1 deletion tests/test_extensions.py
@@ -60,4 +60,4 @@ def test_response_received(self):

# Verify that the stats were incremented correctly
assert extension.stats._stats['webarchive/exporter/response_written'] == 1
assert extension.stats._stats['webarchive/exporter/request_written'] == 1
assert extension.stats._stats['webarchive/exporter/request_written'] == 1
6 changes: 3 additions & 3 deletions tests/test_middleware.py
@@ -46,11 +46,11 @@ def test_wacz_archive_filters_allowed_domains(self):

with self._middleware(SW_WACZ_CRAWL=True) as mw:
out = list(mw.process_start_requests([], self.spider))
assert len(out) == 61
assert len([request for request in out if "wacz_crawl_skip" not in request.flags]) == 61

def test_wacz_archive_filters_archive_regex(self):
setattr(self.spider, "archive_regex", r"https://quotes\.toscrape\.com/page/\d+/")
setattr(self.spider, "archive_disallow_regexp", r"https://quotes\.toscrape\.com/page/\d+/")

with self._middleware(SW_WACZ_CRAWL=True) as mw:
out = list(mw.process_start_requests([], self.spider))
assert len(out) == 9
assert len([request for request in out if "wacz_crawl_skip" not in request.flags]) == 9
