Update usage docs and synchronize spider and downloader middleware
Wesley van Lee committed Oct 18, 2024
1 parent bea52d1 commit 44889fe
Showing 6 changed files with 77 additions and 46 deletions.
13 changes: 9 additions & 4 deletions docs/usage.md
@@ -22,7 +22,7 @@ Running a crawl job using these settings will result in a newly created WACZ file

## Crawling

There are 2 ways to crawl against a WACZ archive. Choose a strategy that you want to use for your crawl job, and follow the instruction as described below. Using both strategies at the same time is not allowed.
There are 2 ways to crawl against a WACZ archive. Choose a strategy that you want to use for your crawl job, and follow the instructions described below.

### Lookup in a WACZ archive

@@ -40,17 +40,22 @@ Then define the location of the WACZ archive with `SW_WACZ_SOURCE_URI` setting:

```python
SW_WACZ_SOURCE_URI = "s3://scrapy-webarchive/archive.wacz"
SW_WACZ_CRAWL = True
```
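
With this lookup strategy the spider itself does not need to change: requests are resolved against the archive and the responses are rebuilt from it. Below is a minimal sketch of such a spider; the spider name, selectors, and archive URI are illustrative, and it assumes the `scrapy_webarchive.downloadermiddlewares.WaczMiddleware` downloader middleware is enabled for this strategy as this section describes.

```python
import scrapy


class QuotesSpider(scrapy.Spider):
    """Hypothetical spider; nothing archive-specific in the callbacks."""

    name = "quotes"
    allowed_domains = ["quotes.toscrape.com"]
    start_urls = ["https://quotes.toscrape.com/"]

    custom_settings = {
        # Point the lookup at the archive; the URI below is a placeholder.
        "SW_WACZ_SOURCE_URI": "s3://scrapy-webarchive/archive.wacz",
    }

    def parse(self, response):
        for text in response.css("div.quote span.text::text").getall():
            yield {"quote": text}
        # Pagination requests are resolved against the archive as well.
        yield from response.follow_all(css="li.next a", callback=self.parse)
```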

### Iterating a WACZ archive

Going around the default behaviour of the spider, the `WaczCrawlMiddleware` spider middleware will, when enabled, replace the crawl by an iteration through all the entries in the WACZ archive.
Bypassing the default behaviour of the spider, the `WaczCrawlMiddleware` spider middleware will, when enabled, replace the crawl with an iteration through all the entries in the WACZ archive index. Then, similar to the previous strategy, it will recreate a response using the data from the archive.

To use the spider middleware, enable it in the settings like so:
To use this strategy, enable both middlewares in the spider settings like so:

```python
DOWNLOADER_MIDDLEWARES = {
"scrapy_webarchive.downloadermiddlewares.WaczMiddleware": 543,
}

SPIDER_MIDDLEWARES = {
"scrapy_webarchive.middleware.WaczCrawlMiddleware": 532,
"scrapy_webarchive.spidermiddlewares.WaczCrawlMiddleware": 543,
}
```
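
Put together, the settings for this strategy might look like the sketch below; the archive URI is a placeholder and the priority values simply follow the example above. `SW_WACZ_CRAWL` needs to be enabled, since the spider middleware only iterates the index when it is set.

```python
# settings.py sketch for the iteration strategy; the archive URI is a
# placeholder and the priority values follow the example above.
SW_WACZ_SOURCE_URI = "s3://scrapy-webarchive/archive.wacz"
SW_WACZ_CRAWL = True  # let the spider middleware replace the start requests

DOWNLOADER_MIDDLEWARES = {
    "scrapy_webarchive.downloadermiddlewares.WaczMiddleware": 543,
}

SPIDER_MIDDLEWARES = {
    "scrapy_webarchive.spidermiddlewares.WaczCrawlMiddleware": 543,
}
```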

31 changes: 20 additions & 11 deletions scrapy_webarchive/downloadermiddlewares.py
@@ -1,10 +1,10 @@
import re

from scrapy.exceptions import IgnoreRequest
from scrapy.http.request import Request
from scrapy.http.response import Response
from scrapy.spiders import Spider

from scrapy_webarchive.exceptions import WaczMiddlewareException
from scrapy_webarchive.spidermiddlewares import BaseWaczMiddleware
from scrapy_webarchive.warc import record_transformer

@@ -18,21 +18,30 @@ class WaczMiddleware(BaseWaczMiddleware):
"""

def process_request(self, request: Request, spider: Spider):
if not hasattr(self, 'wacz'):
self.stats.set_value("webarchive/no_valid_sources", True, spider=spider)
raise IgnoreRequest()
# Continue default crawl behaviour
if not self.crawl:
return None

# ignore blacklisted pages (to avoid crawling e.g. redirects from whitelisted pages to unwanted ones)
if hasattr(spider, "archive_blacklist_regexp") and re.search(spider.archive_blacklist_regexp, request.url):
self.stats.inc_value("webarchive/request_blacklisted", spider=spider)
raise IgnoreRequest()
# If the attribute has not been set, none of the WACZ could be opened.
if self.crawl and not hasattr(self, 'wacz'):
raise WaczMiddlewareException("Could not open any WACZ files, check your WACZ URIs and authentication.")

# ignore when crawling and flag indicates this request needs to be skipped during wacz crawl
if self.crawl and "wacz_crawl_skip" in request.flags:
# Ignore when crawling and flag indicates this request needs to be skipped during WACZ crawl
if "wacz_crawl_skip" in request.flags:
self.stats.inc_value("webarchive/crawl_skip", spider=spider)
raise IgnoreRequest()

# get record from existing index entry, or else lookup by URL
# Filter out off-site requests
if self._is_off_site(request.url, spider):
self.stats.inc_value("webarchive/crawl_skip/off_site", spider=spider)
raise IgnoreRequest()

# Ignore disallowed pages (to avoid crawling e.g. redirects from whitelisted pages to unwanted ones)
if self._is_disallowed_by_spider(request.url, spider):
self.stats.inc_value("webarchive/crawl_skip/disallowed", spider=spider)
raise IgnoreRequest()

# Get record from existing index entry, or else lookup by URL
if request.meta.get("cdxj_record"):
warc_record = self.wacz.get_warc_from_cdxj_record(cdxj_record=request.meta["cdxj_record"])
else:
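
The off-site and disallow checks above are driven by optional spider attributes: `allowed_domains` (standard Scrapy) and `archive_disallow_regexp` introduced in this commit. Note that, as implemented in `_is_disallowed_by_spider`, a URL is skipped when it does not match the pattern, so the regexp effectively acts as an allow pattern. A sketch of a spider setting both follows; the spider name and pattern are illustrative.

```python
import scrapy


class ArchivedQuotesSpider(scrapy.Spider):
    """Hypothetical spider showing the attributes the middleware inspects."""

    name = "archived_quotes"

    # Requests whose hostname is not in this list are skipped and counted
    # under webarchive/crawl_skip/off_site.
    allowed_domains = ["quotes.toscrape.com"]

    # URLs that do NOT match this pattern are skipped and counted under
    # webarchive/crawl_skip/disallowed (see _is_disallowed_by_spider).
    archive_disallow_regexp = r"https://quotes\.toscrape\.com/page/\d+/"

    def parse(self, response):
        yield {"url": response.url}
```
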
69 changes: 43 additions & 26 deletions scrapy_webarchive/spidermiddlewares.py
@@ -9,6 +9,7 @@
from scrapy.statscollectors import StatsCollector
from typing_extensions import Iterable, Self

from scrapy_webarchive.exceptions import WaczMiddlewareException
from scrapy_webarchive.wacz import MultiWaczFile, WaczFile, open_wacz_file
from scrapy_webarchive.warc import record_transformer

@@ -67,6 +68,17 @@ def spider_opened(self, spider: Spider) -> None:
self.wacz = MultiWaczFile(wacz_files)


def _is_off_site(self, url: str, spider: Spider) -> bool:
"""Check if the URL is off-site based on allowed domains."""

return hasattr(spider, "allowed_domains") and urlparse(url).hostname not in spider.allowed_domains

def _is_disallowed_by_spider(self, url: str, spider: Spider) -> bool:
"""Check if the URL is disallowed by the spider's archive rules."""

return hasattr(spider, "archive_disallow_regexp") and not re.search(spider.archive_disallow_regexp, url)


class WaczCrawlMiddleware(BaseWaczMiddleware):
"""
Scrapy WACZ crawl spider middleware to crawl from a WACZ archive.
@@ -82,30 +94,35 @@ def spider_opened(self, spider: Spider) -> None:
super().spider_opened(spider)

def process_start_requests(self, start_requests: Iterable[Request], spider: Spider) -> Iterable[Request]:
if not self.crawl or not hasattr(self, 'wacz'):
for request in start_requests:
yield request

# Ignore original start requests, just yield all responses found
else:
for entry in self.wacz.iter_index():
url = entry.data["url"]

# filter out off-site responses
if hasattr(spider, "allowed_domains") and urlparse(url).hostname not in spider.allowed_domains:
continue

# only accept allowed responses if requested by spider
if hasattr(spider, "archive_regex") and not re.search(spider.archive_regex, url):
continue

"""Processes start requests and yields WACZ index entries or original requests based on the crawl setting."""

# If crawl is disabled, yield the original start requests.
if not self.crawl:
yield from start_requests
return
# If the attribute has not been set, none of the WACZ could be opened.
elif not hasattr(self, "wacz"):
raise WaczMiddlewareException("Could not open any WACZ files, check your WACZ URIs and authentication.")

# Iterate over entries in the WACZ index.
for entry in self.wacz.iter_index():
url = entry.data["url"]

# Filter out off-site requests
if self._is_off_site(url, spider):
self.stats.inc_value("webarchive/crawl_skip/off_site", spider=spider)
flags = ["wacz_start_request", "wacz_crawl_skip"]
# Ignore disallowed pages (to avoid crawling e.g. redirects from whitelisted pages to unwanted ones)
elif self._is_disallowed_by_spider(url, spider):
self.stats.inc_value("webarchive/crawl_skip/disallowed", spider=spider)
flags = ["wacz_start_request", "wacz_crawl_skip"]
else:
self.stats.inc_value("webarchive/start_request_count", spider=spider)

# do not filter to allow all occurrences to be handled
# since we don't yet get all information for the request, this can be necessary
yield record_transformer.request_for_record(
entry,
flags=["wacz_start_request"],
meta={"cdxj_record": entry},
dont_filter=True,
)
flags = ["wacz_start_request"]

yield record_transformer.request_for_record(
entry,
flags=flags,
meta={"cdxj_record": entry},
dont_filter=True,
)
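
With this change, index entries that fail the off-site or disallow checks are no longer silently dropped: they are still yielded, tagged with the `wacz_crawl_skip` flag, and the downloader middleware then ignores them while the skips are recorded in the crawl stats. Below is a sketch of reading those counters after a run, assuming a hypothetical spider and using Scrapy's standard stats API.

```python
import scrapy


class ArchiveStatsSpider(scrapy.Spider):
    """Hypothetical spider that logs the webarchive counters on close."""

    name = "archive_stats"

    def closed(self, reason):
        # Stat keys come from the middleware code in this commit.
        stats = self.crawler.stats
        self.logger.info(
            "start requests: %s, off-site skips: %s, disallowed skips: %s",
            stats.get_value("webarchive/start_request_count", 0),
            stats.get_value("webarchive/crawl_skip/off_site", 0),
            stats.get_value("webarchive/crawl_skip/disallowed", 0),
        )
```
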
2 changes: 1 addition & 1 deletion tests/test_downloadermiddlewares.py
@@ -23,7 +23,7 @@ def _get_wacz_source_url(self) -> str:
def _get_settings(self, **new_settings):
settings = {
"SW_WACZ_SOURCE_URI": self._get_wacz_source_url(),
"SW_WACZ_CRAWL": False,
"SW_WACZ_CRAWL": True,
"SW_WACZ_TIMEOUT": 60,
}
settings.update(new_settings)
2 changes: 1 addition & 1 deletion tests/test_extensions.py
@@ -60,4 +60,4 @@ def test_response_received(self):

# Verify that the stats were incremented correctly
assert extension.stats._stats['webarchive/exporter/response_written'] == 1
assert extension.stats._stats['webarchive/exporter/request_written'] == 1
assert extension.stats._stats['webarchive/exporter/request_written'] == 1
6 changes: 3 additions & 3 deletions tests/test_middleware.py
@@ -46,11 +46,11 @@ def test_wacz_archive_filters_allowed_domains(self):

with self._middleware(SW_WACZ_CRAWL=True) as mw:
out = list(mw.process_start_requests([], self.spider))
assert len(out) == 61
assert len([request for request in out if "wacz_crawl_skip" not in request.flags]) == 61

def test_wacz_archive_filters_archive_regex(self):
setattr(self.spider, "archive_regex", r"https://quotes\.toscrape\.com/page/\d+/")
setattr(self.spider, "archive_disallow_regexp", r"https://quotes\.toscrape\.com/page/\d+/")

with self._middleware(SW_WACZ_CRAWL=True) as mw:
out = list(mw.process_start_requests([], self.spider))
assert len(out) == 9
assert len([request for request in out if "wacz_crawl_skip" not in request.flags]) == 9
