Skip to content

Commit

Permalink
Improve typing and add test for warc name generator
Browse files Browse the repository at this point in the history
  • Loading branch information
Wesley van Lee committed Oct 4, 2024
1 parent df5096f commit 4d18b14
Show file tree
Hide file tree
Showing 4 changed files with 46 additions and 16 deletions.
3 changes: 2 additions & 1 deletion requirements-tests.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
-e .
pytest>=8.3,<8.4
freezegun==1.5.1
mypy==1.11.2
ruff==0.6.8
ruff==0.6.8
2 changes: 1 addition & 1 deletion scrapy_webarchive/wacz.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ def __init__(
store,
warc_fname: str,
cdxj_fname: str = "index.cdxj",
):
) -> None:
self.store = store
self.warc_fname = warc_fname
self.cdxj_fname = cdxj_fname
Expand Down
38 changes: 24 additions & 14 deletions scrapy_webarchive/warc.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

from scrapy import __version__ as scrapy_version
from scrapy.http.request import Request
from scrapy.http.response import Response
from scrapy.responsetypes import ResponseTypes
from warc.warc import WARCRecord
from warcio.recordloader import ArcWarcRecord
Expand All @@ -16,28 +17,37 @@
from scrapy_webarchive.utils import header_lines_to_dict


def create_warc_fname(tla):
def generate_warc_fname(prefix: str) -> str:
"""
Returns new WARC filename. WARC filename format compatible with internetarchive/draintasker warc naming #1:
{TLA}-{timestamp}-{serial}-{fqdn}.warc.gz
Returns new WARC filename based on recommendation in the warc-specification:
https://iipc.github.io/warc-specifications/specifications/warc-format/warc-1.1/#annex-c-informative-warc-file-size-and-name-recommendations
{prefix}-{timestamp}-{serial}-{crawlhost}.warc.gz
"""

timestamp = datetime.now(timezone.utc).strftime("%Y%m%d%H%M%S")
fqdn = socket.gethostname().split(".")[0]
return "-".join([tla, timestamp, "00000", fqdn]) + ".warc.gz"
crawlhost = socket.gethostname().split(".")[0]
# Only one WARC file is generated for now, so the serial is fixed; it is still included to follow the naming recommended by the WARC specification.
serial = '00000'
return "-".join([prefix, timestamp, serial, crawlhost]) + ".warc.gz"


class WarcFileWriter:
"""Handles writing WARC files"""

def __init__(self, collection_name: str):
def __init__(self, collection_name: str) -> None:
self.collection_name = collection_name
self.warc_fname = create_warc_fname(tla=collection_name)
# TODO: If warc_fname exists, raise
self.warc_fname = generate_warc_fname(prefix=collection_name)

def write_record(
self, url, record_type, headers, warc_headers, content_type, content, http_line
):
self,
url: str,
record_type: str,
headers: list[tuple[str, str]],
warc_headers: StatusAndHeaders,
content_type: str,
content: str,
http_line: str,
) -> ArcWarcRecord:
"""Write any WARC record (response or request) to a WARC file"""

with open(self.warc_fname, "ab") as fh:
Expand All @@ -57,7 +67,7 @@ def write_record(

return record

def write_response(self, response, request):
def write_response(self, response: Response, request: Request) -> ArcWarcRecord:
record_id = self.__record_id()
warc_headers = StatusAndHeaders(
"",
Expand Down Expand Up @@ -88,7 +98,7 @@ def write_response(self, response, request):
)
return record

def write_request(self, request, concurrent_to: ArcWarcRecord):
def write_request(self, request: Request, concurrent_to: ArcWarcRecord):
"""Write a WARC-Type: request record"""

record_id = self.__record_id()
Expand Down Expand Up @@ -122,7 +132,7 @@ def write_request(self, request, concurrent_to: ArcWarcRecord):
)
return record

def write_warcinfo(self):
def write_warcinfo(self) -> None:
"""Write WARC-Type: warcinfo record"""

content = {
Expand All @@ -139,7 +149,7 @@ def write_warcinfo(self):
writer.write_record(record)

@staticmethod
def __record_id():
def __record_id() -> str:
"""Returns WARC-Record-ID (globally unique UUID) as a string"""
return f"<urn:uuid:{uuid.uuid1()}>"

Expand Down
19 changes: 19 additions & 0 deletions tests/test_warc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
import socket

from freezegun import freeze_time

from scrapy_webarchive.warc import generate_warc_fname


@freeze_time("2024-10-04 08:27:11")
def test_generate_warc_fname(monkeypatch):
    """Check the generated WARC filename with the clock frozen and the hostname stubbed."""
    # Stub socket.gethostname so the host part of the filename is predictable.
    monkeypatch.setattr(socket, "gethostname", lambda: "example.local")

    generated = generate_warc_fname("rec")

    # The expected name carries the frozen timestamp and "example", the
    # stubbed hostname's text before its first dot.
    assert generated == "rec-20241004082711-00000-example.warc.gz"

0 comments on commit 4d18b14

Please sign in to comment.