Skip to content

Commit

Permalink
Make the output archive name variable based on the spider
Browse files: browse the repository at this point in the history
  • Loading branch information
Wesley van Lee committed Oct 15, 2024
1 parent da0560f commit 89fecc2
Show file tree
Hide file tree
Showing 3 changed files with 22 additions and 15 deletions.
4 changes: 2 additions & 2 deletions scrapy_webarchive/extensions.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,8 +104,8 @@ def response_received(self, response: Response, request: Request, spider: Spider
self.writer.write_request(request, concurrent_to=record)
self.stats.inc_value("wacz/exporter/request_written", spider=spider)

def spider_closed(self) -> None:
WaczFileCreator(warc_fname=self.writer.warc_fname, store=self.store).create()
def spider_closed(self, spider: Spider) -> None:
WaczFileCreator(store=self.store, warc_fname=self.writer.warc_fname, collection_name=spider.name).create()


def get_archive_uri_template_variables() -> dict:
Expand Down
5 changes: 3 additions & 2 deletions scrapy_webarchive/wacz.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,11 @@ class WARCReader(BaseWARCReader):
class WaczFileCreator:
"""Handles creating WACZ archives"""

def __init__(self, store, warc_fname: str, cdxj_fname: str = "index.cdxj") -> None:
def __init__(self, store, warc_fname: str, collection_name: str, cdxj_fname: str = "index.cdxj") -> None:
self.store = store
self.warc_fname = warc_fname
self.cdxj_fname = cdxj_fname
self.collection_name = collection_name

def create(self) -> None:
"""Create the WACZ file from the WARC and CDXJ index and save it in the configured store"""
Expand Down Expand Up @@ -68,7 +69,7 @@ def cleanup_files(self, *files: str) -> None:
def get_wacz_fname(self) -> str:
"""Generate WACZ filename based on the collection name and the current timestamp"""

return f"archive-{get_current_timestamp()}.wacz"
return f"{self.collection_name}-{get_current_timestamp()}.wacz"


class WaczFile:
Expand Down
28 changes: 17 additions & 11 deletions tests/test_wacz.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,39 +8,45 @@


class TestWaczFileCreator:
warc_fname = "example-20241007000000-00000-test.warc"
cdxj_fname = "index.cdxj"
collection_name = "example"

@pytest.fixture
def wacz_file_creator(self):
"""Fixture to initialize the WaczFileCreator with a mocked store"""

store = Mock()
warc_fname = "/scrapy-webarchive/quotes-20241007000000-00000-test.warc"
cdxj_fname = "/scrapy-webarchive/index.cdxj"
return WaczFileCreator(store=store, warc_fname=warc_fname, cdxj_fname=cdxj_fname)
return WaczFileCreator(
store=Mock(),
warc_fname=self.warc_fname,
collection_name=self.collection_name,
cdxj_fname=self.cdxj_fname,
)

@freeze_time("2024-10-04 08:27:11")
def test_create_wacz(self, fs, wacz_file_creator):
# Setup the fake filesystem
fs.create_file("/scrapy-webarchive/index.cdxj", contents="")
fs.create_file("/scrapy-webarchive/quotes-20241007000000-00000-test.warc", contents="")
fs.create_file(self.cdxj_fname, contents="")
fs.create_file(self.warc_fname, contents="")

wacz_file_creator.create()

# Ensure the files are removed after creation
assert not fs.exists("/scrapy-webarchive/index.cdxj")
assert not fs.exists("/scrapy-webarchive/quotes-20241007000000-00000-test.warc")
assert not fs.exists(self.cdxj_fname)
assert not fs.exists(self.warc_fname)

# Verify the WACZ file was persisted in the store
wacz_fname = wacz_file_creator.get_wacz_fname()
wacz_file_creator.store.persist_file.assert_called_once()

# Assert that the correct WACZ filename was used
assert wacz_fname == "archive-20241004082711.wacz"
assert wacz_fname == f"{self.collection_name}-20241004082711.wacz"

# Retrieve the zip buffer from the call args
call_args = wacz_file_creator.store.persist_file.call_args
zip_buffer = call_args[0][1]

# Verify that the WACZ zip content is correct
zip_file = zipfile.ZipFile(zip_buffer)
assert "indexes/index.cdxj" in zip_file.namelist()
assert "archive/quotes-20241007000000-00000-test.warc" in zip_file.namelist()
assert f"indexes/{self.cdxj_fname}" in zip_file.namelist()
assert f"archive/{self.warc_fname}" in zip_file.namelist()

0 comments on commit 89fecc2

Please sign in to comment.