diff --git a/docs/settings.md b/docs/settings.md index 7054e40..e24b1e2 100644 --- a/docs/settings.md +++ b/docs/settings.md @@ -16,6 +16,19 @@ This is the output path of the WACZ file. Multiple variables can be added that a Supported variables: `spider`, `year`, `month`, `day` and `timestamp`. +### `SW_WACZ_TITLE` + +This setting defines the title of the WACZ used in the `datapackage.json`, which is generated during the WACZ creation. It will default to the spider name if it is not configured. + +### `SW_WACZ_DESCRIPTION` + +This setting defines the description of the WACZ used in the `datapackage.json`, which is generated during the WACZ creation. If it is not configured, it defaults to: + +> This is the web archive generated by a scrapy-webarchive extension for the +> spider. It is mainly for scraping purposes as it does not contain +> any js/css data. Though it can be replayed as bare HTML if the site does not depend on +> JavaScript. + ## Downloader middleware and spider middleware ### `SW_WACZ_SOURCE_URI` diff --git a/scrapy_webarchive/__init__.py b/scrapy_webarchive/__init__.py index e69de29..d5d6330 100644 --- a/scrapy_webarchive/__init__.py +++ b/scrapy_webarchive/__init__.py @@ -0,0 +1 @@ +__version__ = "0.0.1.dev2" diff --git a/scrapy_webarchive/extensions.py b/scrapy_webarchive/extensions.py index 8decf39..75465b3 100644 --- a/scrapy_webarchive/extensions.py +++ b/scrapy_webarchive/extensions.py @@ -14,7 +14,7 @@ from twisted.internet.defer import Deferred from typing_extensions import Any, Dict, Protocol, Self, Type, Union, cast -from scrapy_webarchive.utils import get_scheme_from_uri, get_warc_date +from scrapy_webarchive.utils import WARC_DT_FORMAT, get_formatted_dt_string, get_scheme_from_uri from scrapy_webarchive.wacz import WaczFileCreator from scrapy_webarchive.warc import WarcFileWriter @@ -112,7 +112,7 @@ def spider_opened(self) -> None: 
self.writer.write_warcinfo(robotstxt_obey=self.settings["ROBOTSTXT_OBEY"]) def response_received(self, response: Response, request: Request, spider: Spider) -> None: - request.meta["WARC-Date"] = get_warc_date() + request.meta["WARC-Date"] = get_formatted_dt_string(format=WARC_DT_FORMAT) # Write response WARC record record = self.writer.write_response(response, request) @@ -127,7 +127,14 @@ def response_received(self, response: Response, request: Request, spider: Spider self.stats.inc_value("webarchive/exporter/request_written", spider=spider) def spider_closed(self, spider: Spider) -> None: - WaczFileCreator(store=self.store, warc_fname=self.writer.warc_fname, collection_name=spider.name).create() + wacz_creator = WaczFileCreator( + store=self.store, + warc_fname=self.writer.warc_fname, + collection_name=spider.name, + title=self.settings["SW_WACZ_TITLE"], + description=self.settings["SW_WACZ_DESCRIPTION"], + ) + wacz_creator.create() def get_archive_uri_template_dt_variables() -> dict: diff --git a/scrapy_webarchive/utils.py b/scrapy_webarchive/utils.py index 47f3251..0c5cdec 100644 --- a/scrapy_webarchive/utils.py +++ b/scrapy_webarchive/utils.py @@ -1,21 +1,22 @@ from __future__ import annotations +import hashlib +import logging from datetime import datetime, timezone from pathlib import Path +from typing import IO, Tuple from urllib.parse import urlparse, urlunparse from scrapy.settings import Settings WARC_DT_FORMAT = "%Y-%m-%dT%H:%M:%SZ" TIMESTAMP_DT_FORMAT = "%Y%m%d%H%M%S" +BUFF_SIZE = 1024 * 64 +logger = logging.getLogger(__name__) -def get_current_timestamp() -> str: - return datetime.now(timezone.utc).strftime(TIMESTAMP_DT_FORMAT) - - -def get_warc_date() -> str: - return datetime.now(timezone.utc).strftime(WARC_DT_FORMAT) +def get_formatted_dt_string(format: str) -> str: + return datetime.now(timezone.utc).strftime(format) def header_lines_to_dict(lines): @@ -78,3 +79,21 @@ def add_ftp_credentials(wacz_uri: str, settings: Settings) -> str: return 
urlunparse(updated_uri) return wacz_uri + + +def hash_stream(hash_type: str, stream: IO) -> Tuple[int, str]: + """Hashes the stream with given hash_type hasher.""" + + # At this moment the `hash_type` (or algorithm) that we pass will always be sha256 as it is hardcoded. + # This check is implemented in case any other algorithms will be made available in the future. + if hash_type not in hashlib.algorithms_guaranteed: + raise ValueError(f"Unsupported hash type: {hash_type}") + + hasher = hashlib.new(hash_type) + + size = 0 + for chunk in iter(lambda: stream.read(BUFF_SIZE), b""): + size += len(chunk) + hasher.update(chunk) + + return size, f"{hash_type}:{hasher.hexdigest()}" diff --git a/scrapy_webarchive/wacz.py b/scrapy_webarchive/wacz.py index 2a2ab4e..055136a 100644 --- a/scrapy_webarchive/wacz.py +++ b/scrapy_webarchive/wacz.py @@ -2,23 +2,30 @@ import gzip import io +import json import os import zipfile from collections import defaultdict from functools import partial +from typing import Any +from scrapy import __version__ as scrapy_version from scrapy.settings import Settings from smart_open import open as smart_open from typing_extensions import IO, TYPE_CHECKING, Dict, Generator, List, Union from warc.warc import WARCRecord +from scrapy_webarchive import __version__ as scrapy_webarchive_version from scrapy_webarchive.cdxj import CdxjRecord, write_cdxj_index from scrapy_webarchive.utils import ( + TIMESTAMP_DT_FORMAT, + WARC_DT_FORMAT, add_ftp_credentials, - get_current_timestamp, + get_formatted_dt_string, get_gcs_client, get_s3_client, get_scheme_from_uri, + hash_stream, ) from scrapy_webarchive.warc import WARCReader @@ -26,14 +33,21 @@ from scrapy_webarchive.extensions import FilesStoreProtocol +WACZ_VERSION = "1.1.1" + class WaczFileCreator: """Handles creating WACZ archives.""" - def __init__(self, store: 'FilesStoreProtocol', warc_fname: str, collection_name: str, cdxj_fname: str = "index.cdxj") -> None: + hash_type = "sha256" + datapackage_fname = 
"datapackage.json" + + def __init__(self, store: 'FilesStoreProtocol', warc_fname: str, collection_name: str, title: str, description: str, cdxj_fname: str = "index.cdxj") -> None: self.store = store self.warc_fname = warc_fname self.cdxj_fname = cdxj_fname self.collection_name = collection_name + self._title = title + self._description = description def create(self) -> None: """Create the WACZ file from the WARC and CDXJ index and save it in the configured store.""" @@ -59,6 +73,7 @@ def create_wacz_zip(self) -> io.BytesIO: with zipfile.ZipFile(zip_buffer, "w", zipfile.ZIP_DEFLATED) as zip_file: self.write_to_zip(zip_file, self.cdxj_fname, "indexes/") self.write_to_zip(zip_file, self.warc_fname, "archive/") + self.write_datapackage(zip_file) return zip_buffer @@ -77,7 +92,80 @@ def cleanup_files(self, *files: str) -> None: def get_wacz_fname(self) -> str: """Generate WACZ filename based on the WARC filename.""" - return f"{self.collection_name}-{get_current_timestamp()}.wacz" + return f"{self.collection_name}-{get_formatted_dt_string(format=TIMESTAMP_DT_FORMAT)}.wacz" + + def write_datapackage(self, zip_file: zipfile.ZipFile) -> None: + """Main function to create and write the datapackage.json.""" + + package_dict = self.create_package_dict() + + with zip_file.open("archive/" + self.warc_fname) as warc_fh: + package_dict = self.update_package_metadata_from_warc(warc_fh, package_dict) + + package_dict["resources"] = self.collect_resources(zip_file) + + zip_file.writestr(self.datapackage_fname, json.dumps(package_dict, indent=2)) + + def create_package_dict(self) -> Dict[str, Any]: + """Creates the initial package dictionary.""" + + dt_string = get_formatted_dt_string(format=WARC_DT_FORMAT) + return { + "profile": "data-package", + "title": self.title, + "description": self.description, + "created": dt_string, + "modified": dt_string, + "wacz_version": WACZ_VERSION, + "software": f"scrapy-webarchive/{scrapy_webarchive_version}, Scrapy/{scrapy_version}", + } + + def 
update_package_metadata_from_warc(self, warc_fh: IO, package_dict: Dict[str, Any]) -> Dict[str, Any]: + """Updates the package dictionary with metadata from the WARC records.""" + + warc_reader = WARCReader(gzip.open(warc_fh)) if self.warc_fname.endswith(".gz") else WARCReader(warc_fh) + + while True: + warc_record = warc_reader.read_record() + if warc_record is None: + break + + if warc_record.type == "request": + package_dict.update({ + "mainPageUrl": warc_record.url, + "mainPageDate": warc_record.date, + }) + break + + return package_dict + + def collect_resources(self, zip_file: zipfile.ZipFile) -> List[Dict[str, Any]]: + """Collects resource information from the zip file.""" + + resources = [] + + for zip_entry in zip_file.infolist(): + with zip_file.open(zip_entry, "r") as stream: + size, hash_ = hash_stream(self.hash_type, stream) + + resources.append({ + "name": os.path.basename(zip_entry.filename).lower(), + "path": zip_entry.filename, + "hash": hash_, + "bytes": size, + }) + + return resources + + @property + def title(self): + return self._title or self.collection_name + + @property + def description(self): + return self._description or "This is the web archive generated by a scrapy-webarchive extension for the " \ + f"{self.collection_name} spider. It is mainly for scraping purposes as it does not contain " \ + "any js/css data. Though it can be replayed as bare HTML if the site does not depend on JavaScript." 
class WaczFile: diff --git a/scrapy_webarchive/warc.py b/scrapy_webarchive/warc.py index a6866cd..a8a0776 100644 --- a/scrapy_webarchive/warc.py +++ b/scrapy_webarchive/warc.py @@ -18,7 +18,7 @@ from scrapy_webarchive.cdxj import CdxjRecord from scrapy_webarchive.exceptions import WaczMiddlewareException -from scrapy_webarchive.utils import get_current_timestamp, header_lines_to_dict +from scrapy_webarchive.utils import TIMESTAMP_DT_FORMAT, get_formatted_dt_string, header_lines_to_dict def generate_warc_fname(prefix: str) -> str: @@ -28,10 +28,12 @@ def generate_warc_fname(prefix: str) -> str: {prefix}-{timestamp}-{serial}-{crawlhost}.warc.gz """ + timestamp = get_formatted_dt_string(format=TIMESTAMP_DT_FORMAT) crawlhost = socket.gethostname().split(".")[0] # As of now we only generate one WARC file. Add serial in here to adhere to the warc specification. serial = '00000' - return "-".join([prefix, get_current_timestamp(), serial, crawlhost]) + ".warc.gz" + + return "-".join([prefix, timestamp, serial, crawlhost]) + ".warc.gz" class WARCReader(BaseWARCReader): diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..14cbc35 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,17 @@ +import pytest + + +@pytest.fixture +def warc_example(): + return b"\ +WARC/1.0\r\n\ +Content-Length: 10\r\n\ +WARC-Date: 2024-02-10T16:15:52Z\r\n\ +Content-Type: application/http; msgtype=request\r\n\ +WARC-Type: request\r\n\ +WARC-Record-ID: \r\n\ +WARC-Target-URI: http://example.com/\r\n\ +\r\n\ +Helloworld\ +\r\n\r\n\ +" diff --git a/tests/test_utils.py b/tests/test_utils.py new file mode 100644 index 0000000..c402fca --- /dev/null +++ b/tests/test_utils.py @@ -0,0 +1,53 @@ +import hashlib +import io + +import pytest + +from scrapy_webarchive.utils import BUFF_SIZE, hash_stream + + +def test_hash_stream_with_empty_stream(): + # Test with an empty stream + data = b"" + stream = io.BytesIO(data) + size, result = hash_stream("sha256", stream) + + assert size == 
0 + assert result == f"sha256:{hashlib.sha256(data).hexdigest()}" + +def test_hash_stream_with_md5_algorithm(): + data = b"Hello world" + expected_hash = hashlib.md5(data).hexdigest() + + stream = io.BytesIO(data) + size, result = hash_stream("md5", stream) + + assert size == len(data) + assert result == f"md5:{expected_hash}" + +def test_hash_stream_with_sha256_algorithm(): + data = b"Hello world" + expected_hash = hashlib.sha256(data).hexdigest() + + stream = io.BytesIO(data) + size, result = hash_stream("sha256", stream) + + assert size == len(data) + assert result == f"sha256:{expected_hash}" + +def test_hash_stream_with_unsupported_hash_type(): + data = b"Hello world" + stream = io.BytesIO(data) + + with pytest.raises(ValueError): + hash_stream("unsupported_hash", stream) + +def test_hash_stream_with_large_stream(): + data = b"a" * (2 * BUFF_SIZE) # Twice the buffer size + expected_hash = hashlib.sha256(data).hexdigest() + + stream = io.BytesIO(data) + size, result = hash_stream("sha256", stream) + + assert size == len(data) + assert result == f"sha256:{expected_hash}" diff --git a/tests/test_wacz.py b/tests/test_wacz.py index ee643c3..8d6da25 100644 --- a/tests/test_wacz.py +++ b/tests/test_wacz.py @@ -1,10 +1,15 @@ import zipfile +from io import BytesIO +from typing import cast from unittest.mock import Mock import pytest from freezegun import freeze_time +from scrapy import __version__ as scrapy_version -from scrapy_webarchive.wacz import WaczFileCreator +from scrapy_webarchive import __version__ as scrapy_webarchive_version +from scrapy_webarchive.extensions import FilesStoreProtocol +from scrapy_webarchive.wacz import WACZ_VERSION, WaczFileCreator class TestWaczFileCreator: @@ -16,15 +21,19 @@ class TestWaczFileCreator: def wacz_file_creator(self): """Fixture to initialize the WaczFileCreator with a mocked store""" + mock_store = cast(FilesStoreProtocol, Mock(spec=FilesStoreProtocol)) + return WaczFileCreator( - store=Mock(), + store=mock_store, 
warc_fname=self.warc_fname, collection_name=self.collection_name, cdxj_fname=self.cdxj_fname, + title="Testing", + description="WACZ generated during a unit-test", ) @freeze_time("2024-10-04 08:27:11") - def test_create_wacz(self, fs, wacz_file_creator): + def test_create_wacz(self, fs, wacz_file_creator: WaczFileCreator): # Setup the fake filesystem fs.create_file(self.cdxj_fname, contents="") fs.create_file(self.warc_fname, contents="") @@ -37,16 +46,40 @@ def test_create_wacz(self, fs, wacz_file_creator): # Verify the WACZ file was persisted in the store wacz_fname = wacz_file_creator.get_wacz_fname() - wacz_file_creator.store.persist_file.assert_called_once() + mock_store = cast(Mock, wacz_file_creator.store) + mock_store.persist_file.assert_called_once() # Assert that the correct WACZ filename was used assert wacz_fname == f"{self.collection_name}-20241004082711.wacz" # Retrieve the zip buffer from the call args - call_args = wacz_file_creator.store.persist_file.call_args - zip_buffer = call_args[1]['buf'] + call_args = mock_store.persist_file.call_args + zip_buffer = call_args[1]["buf"] # Verify that the WACZ zip content is correct zip_file = zipfile.ZipFile(zip_buffer) assert f"indexes/{self.cdxj_fname}" in zip_file.namelist() assert f"archive/{self.warc_fname}" in zip_file.namelist() + assert "datapackage.json" in zip_file.namelist() + + @freeze_time("2024-10-04 08:27:11") + def test_create_package_dict(self, wacz_file_creator: WaczFileCreator): + package_dict = wacz_file_creator.create_package_dict() + + expected = { + "profile": "data-package", + "title": "Testing", + "description": "WACZ generated during a unit-test", + "created": "2024-10-04T08:27:11Z", + "modified": "2024-10-04T08:27:11Z", + "wacz_version": WACZ_VERSION, + "software": f"scrapy-webarchive/{scrapy_webarchive_version}, Scrapy/{scrapy_version}", + } + + assert package_dict == expected + + def test_package_metadata_from_warc(self, wacz_file_creator: WaczFileCreator, warc_example): + res = 
wacz_file_creator.update_package_metadata_from_warc(BytesIO(warc_example), {}) + + assert res["mainPageUrl"] == "http://example.com/" + assert res["mainPageDate"] == "2024-02-10T16:15:52Z"