diff --git a/scrapy_webarchive/extensions.py b/scrapy_webarchive/extensions.py index d909443..8447b85 100644 --- a/scrapy_webarchive/extensions.py +++ b/scrapy_webarchive/extensions.py @@ -104,8 +104,8 @@ def response_received(self, response: Response, request: Request, spider: Spider self.writer.write_request(request, concurrent_to=record) self.stats.inc_value("wacz/exporter/request_written", spider=spider) - def spider_closed(self) -> None: - WaczFileCreator(warc_fname=self.writer.warc_fname, store=self.store).create() + def spider_closed(self, spider: Spider) -> None: + WaczFileCreator(store=self.store, warc_fname=self.writer.warc_fname, collection_name=spider.name).create() def get_archive_uri_template_variables() -> dict: diff --git a/scrapy_webarchive/wacz.py b/scrapy_webarchive/wacz.py index 7a24ac6..ec78705 100644 --- a/scrapy_webarchive/wacz.py +++ b/scrapy_webarchive/wacz.py @@ -21,10 +21,11 @@ class WARCReader(BaseWARCReader): class WaczFileCreator: """Handles creating WACZ archives""" - def __init__(self, store, warc_fname: str, cdxj_fname: str = "index.cdxj") -> None: + def __init__(self, store, warc_fname: str, collection_name: str, cdxj_fname: str = "index.cdxj") -> None: self.store = store self.warc_fname = warc_fname self.cdxj_fname = cdxj_fname + self.collection_name = collection_name def create(self) -> None: """Create the WACZ file from the WARC and CDXJ index and save it in the configured store""" @@ -68,7 +69,7 @@ def cleanup_files(self, *files: str) -> None: def get_wacz_fname(self) -> str: """Generate WACZ filename based on the WARC filename""" - return f"archive-{get_current_timestamp()}.wacz" + return f"{self.collection_name}-{get_current_timestamp()}.wacz" class WaczFile: diff --git a/tests/test_wacz.py b/tests/test_wacz.py index 49867af..28f08f8 100644 --- a/tests/test_wacz.py +++ b/tests/test_wacz.py @@ -8,33 +8,39 @@ class TestWaczFileCreator: + warc_fname = "example-20241007000000-00000-test.warc" + cdxj_fname = "index.cdxj" + collection_name = "example" + @pytest.fixture def wacz_file_creator(self): """Fixture to initialize the WaczFileCreator with a mocked store""" - store = Mock() - warc_fname = "/scrapy-webarchive/quotes-20241007000000-00000-test.warc" - cdxj_fname = "/scrapy-webarchive/index.cdxj" - return WaczFileCreator(store=store, warc_fname=warc_fname, cdxj_fname=cdxj_fname) + return WaczFileCreator( + store=Mock(), + warc_fname=self.warc_fname, + collection_name=self.collection_name, + cdxj_fname=self.cdxj_fname, + ) @freeze_time("2024-10-04 08:27:11") def test_create_wacz(self, fs, wacz_file_creator): # Setup the fake filesystem - fs.create_file("/scrapy-webarchive/index.cdxj", contents="") - fs.create_file("/scrapy-webarchive/quotes-20241007000000-00000-test.warc", contents="") + fs.create_file(self.cdxj_fname, contents="") + fs.create_file(self.warc_fname, contents="") wacz_file_creator.create() # Ensure the files are removed after creation - assert not fs.exists("/scrapy-webarchive/index.cdxj") - assert not fs.exists("/scrapy-webarchive/quotes-20241007000000-00000-test.warc") + assert not fs.exists(self.cdxj_fname) + assert not fs.exists(self.warc_fname) # Verify the WACZ file was persisted in the store wacz_fname = wacz_file_creator.get_wacz_fname() wacz_file_creator.store.persist_file.assert_called_once() # Assert that the correct WACZ filename was used - assert wacz_fname == "archive-20241004082711.wacz" + assert wacz_fname == f"{self.collection_name}-20241004082711.wacz" # Retrieve the zip buffer from the call args call_args = wacz_file_creator.store.persist_file.call_args @@ -42,5 +48,5 @@ def test_create_wacz(self, fs, wacz_file_creator): # Verify that the WACZ zip content is correct zip_file = zipfile.ZipFile(zip_buffer) - assert "indexes/index.cdxj" in zip_file.namelist() - assert "archive/quotes-20241007000000-00000-test.warc" in zip_file.namelist() + assert f"indexes/{self.cdxj_fname}" in zip_file.namelist() + assert f"archive/{self.warc_fname}" in zip_file.namelist()