Allow setting the WACZ output filename in the EXPORT_URI setting (#18)
Wesley van Lee committed Nov 1, 2024
1 parent e044c9b commit a570295
Showing 7 changed files with 78 additions and 28 deletions.
6 changes: 6 additions & 0 deletions docs/settings.md
@@ -7,9 +7,15 @@
### `SW_EXPORT_URI`

```python
# Either configure the directory where the output should be uploaded to
SW_EXPORT_URI = "s3://scrapy-webarchive/"
SW_EXPORT_URI = "s3://scrapy-webarchive/{spider}/"
SW_EXPORT_URI = "s3://scrapy-webarchive/{year}/{month}/{day}/{spider}/"

# OR add the file name for full control of the output
SW_EXPORT_URI = "s3://scrapy-webarchive/output.wacz"
SW_EXPORT_URI = "s3://scrapy-webarchive/{spider}/output-{timestamp}.wacz"
SW_EXPORT_URI = "s3://scrapy-webarchive/{year}/{month}/{day}/{spider}-{timestamp}.wacz"
```

This is the output path of the WACZ file. Several template variables can be used to generate the output path dynamically. If the URI ends in a filename, that exact name is used for the WACZ file; otherwise a filename is generated automatically.
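
For illustration, a minimal sketch of how such a template could be expanded, assuming `str.format`-style substitution (the spider name `quotes` and the timestamp format below are assumptions, not the extension's exact behavior):

```python
from datetime import datetime, timezone

# Hypothetical expansion of the documented template variables.
now = datetime.now(timezone.utc)
export_uri = "s3://scrapy-webarchive/{year}/{month}/{day}/{spider}-{timestamp}.wacz".format(
    spider="quotes",  # assumed spider name
    year=now.strftime("%Y"),
    month=now.strftime("%m"),
    day=now.strftime("%d"),
    timestamp=now.strftime("%Y%m%d%H%M%S"),  # assumed timestamp format
)
# e.g. "s3://scrapy-webarchive/2024/11/01/quotes-20241101083000.wacz"
```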
4 changes: 2 additions & 2 deletions pyproject.toml
@@ -57,14 +57,14 @@ exclude = [
"venv",
]

line-length = 119
line-length = 120
indent-width = 4

# Assume Python 3.8
target-version = "py38"

[tool.ruff.lint]
select = ["E4", "E7", "E9", "F", "I"]
select = ["E4", "E7", "E9", "E501", "F", "I"]
ignore = []
fixable = ["ALL"]
unfixable = []
58 changes: 41 additions & 17 deletions scrapy_webarchive/extensions.py
@@ -1,7 +1,9 @@
from __future__ import annotations

import os
from datetime import datetime
from io import BytesIO
from typing import Tuple, Union

from scrapy import Spider, signals
from scrapy.crawler import Crawler
@@ -39,6 +41,7 @@ def persist_file(
class WaczExporter:
"""WACZ exporter extension that writes spider requests/responses as WARC and later compiles them to a WACZ."""

wacz_fname = None
STORE_SCHEMES: Dict[str, Type[FilesStoreProtocol]] = {
"": FSFilesStore,
"file": FSFilesStore,
@@ -50,27 +53,47 @@ class WaczExporter:
def __init__(self, settings: Settings, crawler: Crawler) -> None:
self.settings = settings
self.stats = crawler.stats
self.crawler = crawler

if not self.settings["SW_EXPORT_URI"]:
raise NotConfigured

if "scrapy_webarchive.spidermiddlewares.WaczCrawlMiddleware" in settings.getlist('SPIDER_MIDDLEWARES'):
raise NotConfigured("You must disable the WaczCrawlMiddleware before you can use this extension.")
# Check configuration prerequisites
self._check_configuration_prerequisites()

if "scrapy_webarchive.downloadermiddlewares.WaczMiddleware" in settings.getlist('DOWNLOADER_MIDDLEWARES'):
raise NotConfigured("You must disable the WaczMiddleware before you can use this extension.")
# Get the store URI and configure the WACZ filename
store_uri, self.wacz_fname = self._retrieve_store_uri_and_wacz_fname()

self.store: FilesStoreProtocol = self._get_store(spider_name=crawler.spider.name)
# Initialize store and writer
self.store: FilesStoreProtocol = self._get_store(store_uri)
self.writer = WarcFileWriter(collection_name=crawler.spider.name)

def _get_store(self, spider_name: str) -> FilesStoreProtocol:
archive_uri_template = self.settings["SW_EXPORT_URI"]
uri = archive_uri_template.format(**{
"spider": spider_name,
def _check_configuration_prerequisites(self) -> None:
"""raises NotConfigured if essential settings or middleware configurations are incorrect."""

if not self.settings.get("SW_EXPORT_URI"):
raise NotConfigured("Missing SW_EXPORT_URI setting.")

forbidden_middleware = [
("scrapy_webarchive.spidermiddlewares.WaczCrawlMiddleware", "SPIDER_MIDDLEWARES"),
("scrapy_webarchive.downloadermiddlewares.WaczMiddleware", "DOWNLOADER_MIDDLEWARES"),
]
if any(middleware in self.settings.getlist(key) for middleware, key in forbidden_middleware):
raise NotConfigured("Disable WACZ middlewares in SPIDER_MIDDLEWARES and DOWNLOADER_MIDDLEWARES.")

def _retrieve_store_uri_and_wacz_fname(self) -> Tuple[str, Union[str, None]]:
"""Sets up the export URI based on configuration and spider context."""

export_uri = self.settings["SW_EXPORT_URI"].format(
spider=self.crawler.spider.name,
**get_archive_uri_template_dt_variables(),
})
store_cls = self.STORE_SCHEMES[get_scheme_from_uri(uri)]
return store_cls(uri)
)

if os.path.isdir(export_uri):
return export_uri, None
else:
return os.path.split(export_uri)

def _get_store(self, store_uri: str) -> FilesStoreProtocol:
store_cls = self.STORE_SCHEMES[get_scheme_from_uri(store_uri)]
return store_cls(store_uri)

@classmethod
def from_crawler(cls, crawler: Crawler) -> Self:
@@ -134,8 +157,9 @@ def response_received(self, response: Response, request: Request, spider: Spider

def spider_closed(self, spider: Spider) -> None:
wacz_creator = WaczFileCreator(
store=self.store,
warc_fname=self.writer.warc_fname,
store=self.store,
warc_fname=self.writer.warc_fname,
wacz_fname=self.wacz_fname,
collection_name=spider.name,
title=self.settings["SW_WACZ_TITLE"],
description=self.settings["SW_WACZ_DESCRIPTION"],
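To illustrate the split performed by the new `_retrieve_store_uri_and_wacz_fname`, here is a quick standalone sketch of how `os.path.split` behaves on the two documented URI styles:

```python
import os

# A URI that ends in a filename splits into (store URI, WACZ filename).
print(os.path.split("s3://scrapy-webarchive/quotes/output.wacz"))
# -> ('s3://scrapy-webarchive/quotes', 'output.wacz')

# A URI that ends in a slash yields an empty filename; downstream,
# WaczFileCreator then falls back to its generated default name
# (wacz_fname or self.get_wacz_fname()).
print(os.path.split("s3://scrapy-webarchive/quotes/"))
# -> ('s3://scrapy-webarchive/quotes', '')
```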
6 changes: 4 additions & 2 deletions scrapy_webarchive/spidermiddlewares.py
@@ -47,7 +47,7 @@ def spider_opened(self, spider: Spider) -> None:
process, and collects valid WACZ files for further use.
If only one WACZ URI is provided, it opens and assigns the file to `self.wacz` as a `WaczFile` instance.
If multiple URIs are provided, valid files are grouped and assigned to `self.wacz` as a `MultiWaczFile` instance.
If multiple URIs are provided, valid files are assigned to `self.wacz` as a `MultiWaczFile` instance.
"""

spider.logger.info(f"[WACZDownloader] Found {len(self.wacz_uris)} WACZ URI(s) to open")
@@ -62,7 +62,9 @@ def spider_opened(self, spider: Spider) -> None:
spider.logger.error(f"[WACZDownloader] Could not open WACZ {wacz_uri}")

if wacz_files:
spider.logger.info(f"[WACZDownloader] Continuing with {len(wacz_files)}/{len(self.wacz_uris)} valid WACZ files")
spider.logger.info(
f"[WACZDownloader] Continuing with {len(wacz_files)}/{len(self.wacz_uris)} valid WACZ files"
)
if len(wacz_files) == 1:
self.wacz = WaczFile(wacz_files[0])
else:
25 changes: 19 additions & 6 deletions scrapy_webarchive/wacz.py
@@ -41,13 +41,23 @@ class WaczFileCreator:
hash_type = "sha256"
datapackage_fname = "datapackage.json"

def __init__(self, store: 'FilesStoreProtocol', warc_fname: str, collection_name: str, title: str, description: str, cdxj_fname: str = "index.cdxj") -> None:
def __init__(
self,
store: 'FilesStoreProtocol',
warc_fname: str,
collection_name: str,
title: str,
description: str,
wacz_fname: Union[str, None],
cdxj_fname: str = "index.cdxj",
) -> None:
self.store = store
self.warc_fname = warc_fname
self.cdxj_fname = cdxj_fname
self.collection_name = collection_name
self._title = title
self._description = description
self.wacz_fname = wacz_fname or self.get_wacz_fname()

def create(self) -> None:
"""Create the WACZ file from the WARC and CDXJ index and save it in the configured store."""
@@ -63,7 +73,7 @@ def create(self) -> None:

# Save WACZ to the storage
zip_buffer.seek(0)
self.store.persist_file(path=self.get_wacz_fname(), buf=zip_buffer, info=None)
self.store.persist_file(path=self.wacz_fname, buf=zip_buffer, info=None)

def create_wacz_zip(self) -> io.BytesIO:
"""Create the WACZ zip file and return the in-memory buffer."""
@@ -160,12 +170,15 @@ def collect_resources(self, zip_file: zipfile.ZipFile) -> List[Dict[str, Any]]:
@property
def title(self):
return self._title or self.collection_name

@property
def description(self):
return self._description or "This is the web archive generated by a scrapy-webarchive extension for the " \
f"{self.collection_name} spider. It is mainly for scraping purposes as it does not contain " \
"any js/css data. Though it can be replayed as bare HTML if the site does not depend on JavaScript."
return (
self._description
or f"This is the web archive generated by a scrapy-webarchive extension for the {self.collection_name} "
"spider. It is mainly for scraping purposes as it does not contain any js/css data. Though it can be "
"replayed as bare HTML if the site does not depend on JavaScript."
)


class WaczFile:
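As a usage sketch of the new `wacz_fname` parameter: the dummy store below is a stand-in for the real `FilesStoreProtocol` implementations, and the WARC filename is made up for illustration.

```python
from scrapy_webarchive.wacz import WaczFileCreator

class DummyStore:
    # Stand-in store that just reports what would be persisted.
    def persist_file(self, path, buf, info):
        print(f"would persist {len(buf.getvalue())} bytes to {path}")

creator = WaczFileCreator(
    store=DummyStore(),
    warc_fname="quotes-20241101000000-00000-host.warc.gz",  # assumed name
    collection_name="quotes",
    title="",
    description="",
    wacz_fname=None,  # None -> falls back to self.get_wacz_fname()
)
# Passing e.g. wacz_fname="output.wacz" instead pins the output name,
# which is what a filename in SW_EXPORT_URI now does.
```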
1 change: 1 addition & 0 deletions tests/test_wacz.py
@@ -30,6 +30,7 @@ def wacz_file_creator(self):
cdxj_fname=self.cdxj_fname,
title="Testing",
description="WACZ generated durning a unit-test",
wacz_fname=None,
)

@freeze_time("2024-10-04 08:27:11")
6 changes: 5 additions & 1 deletion tests/test_warc.py
@@ -24,7 +24,11 @@ def test_generate_warc_fname(monkeypatch):

@pytest.fixture
def warc_record_response():
payload = b"""HTTP/1.0 200\r\nContent-Length: 11064\r\nDate: Mon, 07 Oct 2024 09:58:44 GMT\r\nContent-Type: text/html; charset=utf-8\r\nStrict-Transport-Security: max-age=0; includeSubDomains; preload\r\n\r\n<!DOCTYPE html>\n<html lang="en">Welcome to scrapy-webarchive!</html>"""
payload = (
b"HTTP/1.0 200\r\nContent-Length: 11064\r\nDate: Mon, 07 Oct 2024 09:58:44 GMT\r\nContent-Type: text/html; "
b"charset=utf-8\r\nStrict-Transport-Security: max-age=0; includeSubDomains; preload\r\n\r\n<!DOCTYPE html>\n"
b"<html lang=\"en\">Welcome to scrapy-webarchive!</html>"
)
return WARCRecord(payload=payload, headers={"WARC-Target-URI": "http://example.com"})


