Skip to content

Commit

Permalink
Add additional variable for the archive output URI
Browse files Browse the repository at this point in the history
  • Loading branch information
Wesley van Lee committed Oct 22, 2024
1 parent 6ebb64b commit 26204d5
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 5 deletions.
2 changes: 1 addition & 1 deletion docs/settings.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

```python
SW_EXPORT_URI = "s3://scrapy-webarchive/"
SW_EXPORT_URI = "s3://scrapy-webarchive/{year}/{month}/{day}/"
SW_EXPORT_URI = "s3://scrapy-webarchive/{year}/{month}/{day}/{spider}/"
```

This is the output path of the WACZ file. The URI may contain template variables, such as `{year}`, `{month}`, `{day}` and `{spider}`, which are substituted to generate the output path dynamically.
Expand Down
11 changes: 7 additions & 4 deletions scrapy_webarchive/extensions.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,12 +32,15 @@ def __init__(self, settings: Settings, crawler: Crawler) -> None:
if not self.settings["SW_EXPORT_URI"]:
raise NotConfigured

self.store = self._get_store()
self.store = self._get_store(spider_name=crawler.spider.name)
self.writer = WarcFileWriter(collection_name=crawler.spider.name)

def _get_store(self):
def _get_store(self, spider_name: str):
archive_uri_template = self.settings["SW_EXPORT_URI"]
uri = archive_uri_template.format(**get_archive_uri_template_variables())
uri = archive_uri_template.format(**{
"spider": spider_name,
**get_archive_uri_template_dt_variables(),
})
store_cls = self.STORE_SCHEMES[get_scheme_from_uri(uri)]
return store_cls(uri)

Expand Down Expand Up @@ -105,7 +108,7 @@ def spider_closed(self, spider: Spider) -> None:
WaczFileCreator(store=self.store, warc_fname=self.writer.warc_fname, collection_name=spider.name).create()


def get_archive_uri_template_variables() -> dict:
def get_archive_uri_template_dt_variables() -> dict:
current_date = datetime.now()

return {
Expand Down

0 comments on commit 26204d5

Please sign in to comment.