unit-test: add tests for downloadermiddleware, extension and wacz file handler
Wesley van Lee committed Oct 14, 2024
1 parent 8a58867 commit c15e3d3
Showing 5 changed files with 67 additions and 36 deletions.
20 changes: 0 additions & 20 deletions scrapy_webarchive/wacz.py
@@ -102,11 +102,6 @@ def iter_index(self):
for r in f.iter_index():
yield {**r, "_wacz_file": f}

def iter_warc(self):
for f in self.waczs:
for r in f.iter_warc():
yield r


class WaczFile:
"""
@@ -163,21 +158,6 @@ def iter_index(self):
for record in records:
yield record

def iter_warc(self):
for entry in self.wacz_file.infolist():
if entry.is_dir():
continue

if not entry.filename.startswith("archive/"):
continue

warc_file = self.wacz_file.open(entry)
if entry.filename.endswith(".gz"):
warc_file = gzip.open(warc_file)

reader = WARCReader(warc_file)
for record in reader:
yield record

@staticmethod
def _get_index(wacz_file):
Binary file added tests/data/warc_1_1/goodreads.wacz.gz
44 changes: 36 additions & 8 deletions tests/test_downloadermiddlewares.py
@@ -10,40 +10,68 @@
from . import get_test_data_path


class TestWaczMiddleware:
class BaseTestWaczMiddleware:
def setup_method(self):
self.crawler = get_crawler(Spider)
self.spider = self.crawler._create_spider("quotes")

def _get_wacz_source_url(self) -> str:
"""Return the WACZ source URL for this test. Override in subclasses."""

raise NotImplementedError

def _get_settings(self, **new_settings):
settings = {
"SW_WACZ_SOURCE_URL": get_test_data_path("warc_1_1", "quotes.wacz.gz").as_uri(),
"SW_WACZ_SOURCE_URL": self._get_wacz_source_url(),
"SW_WACZ_CRAWL": False,
"SW_WACZ_TIMEOUT": 60,
}
settings.update(new_settings)
return Settings(settings)

@contextmanager
def _middleware(self, **new_settings):
settings = self._get_settings(**new_settings)
mw = WaczMiddleware(settings, self.crawler.stats)
mw.spider_opened(self.spider)
yield mw


class TestWaczMiddleware(BaseTestWaczMiddleware):
def _get_wacz_source_url(self):
return get_test_data_path("warc_1_1", "quotes.wacz.gz").as_uri()

def test_retrieve_from_wacz_record_not_found(self):
request = Request("http://www.example.com/")
with self._middleware() as mw:
response = mw.process_request(request, self.spider)
assert response
assert response.status == 404

def test_retrieve_from_wacz(self):
# Response for the URL exists in the WACZ archive.
request = Request("https://quotes.toscrape.com/tag/love/")

with self._middleware() as mw:
response = mw.process_request(request, self.spider)
assert response
assert response.status == 200

def test_retrieve_from_wacz_record_not_found(self):
request = Request("https://example.com/")

class TestWaczMiddlewareMultiWacz(BaseTestWaczMiddleware):
def _get_wacz_source_url(self):
wacz_1 = get_test_data_path("warc_1_1", "quotes.wacz.gz").as_uri()
wacz_2 = get_test_data_path("warc_1_1", "goodreads.wacz.gz").as_uri()
return f'{wacz_1},{wacz_2}'

def test_retrieve_from_first_wacz(self):
request = Request("https://quotes.toscrape.com/tag/love/")
with self._middleware() as mw:
response = mw.process_request(request, self.spider)
assert response
assert response.status == 404
assert response.status == 200

def test_retrieve_from_second_wacz(self):
request = Request("https://www.goodreads.com/quotes")
with self._middleware() as mw:
response = mw.process_request(request, self.spider)
assert response
assert response.status == 200
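
Note: the multi-WACZ test above implies that `SW_WACZ_SOURCE_URL` accepts a comma-separated list of archive URIs. A minimal settings sketch under that assumption (the file paths are hypothetical):

```python
# settings.py -- hypothetical Scrapy project settings; the comma-separated
# form mirrors what TestWaczMiddlewareMultiWacz builds in _get_wacz_source_url.
SW_WACZ_SOURCE_URL = (
    "file:///data/warc_1_1/quotes.wacz.gz,"
    "file:///data/warc_1_1/goodreads.wacz.gz"
)
SW_WACZ_CRAWL = False   # only replay archived responses, as in the tests
SW_WACZ_TIMEOUT = 60
```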
23 changes: 23 additions & 0 deletions tests/test_extensions.py
@@ -2,6 +2,7 @@

import pytest
from scrapy.exceptions import NotConfigured
from scrapy.http import Request, Response
from scrapy.pipelines.files import FSFilesStore, FTPFilesStore, GCSFilesStore, S3FilesStore
from scrapy.utils.test import get_crawler

@@ -38,3 +39,25 @@ def test_get_store(self, *args):
crawler.spider = crawler._create_spider("quotes")
extension = WaczExporter.from_crawler(crawler)
assert isinstance(extension.store, FTPFilesStore)

def test_response_received(self):
crawler = get_crawler(settings_dict={"SW_EXPORT_URI": "/tmp/scrapy-webarchive/wacz/"})
crawler.spider = crawler._create_spider("quotes")
extension = WaczExporter.from_crawler(crawler)
extension.writer = mock.Mock()

# Call the method under test
request = Request("http://example.com")
response = Response(request.url)
extension.response_received(response, request, crawler.spider)

# Verify that the WARC date was set in request meta
assert "WARC-Date" in request.meta

# Verify that the response and request were written to the WARC file
extension.writer.write_response.assert_called_once()
extension.writer.write_request.assert_called_once()

# Verify that the stats were incremented correctly
assert extension.stats._stats['wacz/exporter/response_written'] == 1
assert extension.stats._stats['wacz/exporter/request_written'] == 1
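
Note: `response_received` is a standard Scrapy signal with the `(response, request, spider)` signature the test invokes directly. A minimal sketch of how an extension like `WaczExporter` would typically subscribe to it (the wiring shown is an assumption, not code from this diff):

```python
from scrapy import signals

class ExporterSketch:
    """Hypothetical extension skeleton; only the signal wiring is shown."""

    @classmethod
    def from_crawler(cls, crawler):
        ext = cls()
        # Scrapy emits response_received(response, request, spider) for every
        # downloaded response, which is exactly how the test calls the handler.
        crawler.signals.connect(ext.response_received, signal=signals.response_received)
        return ext

    def response_received(self, response, request, spider):
        ...  # write the request/response pair as WARC records here
```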
16 changes: 8 additions & 8 deletions tests/test_warc.py
@@ -16,15 +16,15 @@

@freeze_time("2024-10-04 08:27:11")
def test_generate_warc_fname(monkeypatch):
prefix = "rec"
prefix = "example"
monkeypatch.setattr(socket, "gethostname", lambda: "example.local")
assert generate_warc_fname(prefix) == "rec-20241004082711-00000-example.warc.gz"
assert generate_warc_fname(prefix) == "example-20241004082711-00000-example.warc.gz"


@pytest.fixture
def warc_record_response():
payload = b"""HTTP/1.0 200\r\nContent-Length: 11064\r\nDate: Mon, 07 Oct 2024 09:58:44 GMT\r\nContent-Type: text/html; charset=utf-8\r\nStrict-Transport-Security: max-age=0; includeSubDomains; preload\r\n\r\n<!DOCTYPE html>\n<html lang="en">Welcome to scrapy-webarchive!</html>"""
return WARCRecord(payload=payload, headers={"WARC-Target-URI": "https://quotes.toscrape.com/"})
return WARCRecord(payload=payload, headers={"WARC-Target-URI": "http://example.com"})


@pytest.fixture
@@ -35,18 +35,18 @@ def warc_record_request():
class TestWarcRecordTransformer:
def test_request_for_record(self):
record = {
"url": "https://quotes.toscrape.com/",
"url": "http://example.com",
"mime": "text/html",
"status": "200",
"digest": "sha1:AA7J5JETQ4H7GG22MU2NCAUO6LM2EPEU",
"length": "2302",
"offset": "384",
"filename": "quotes-20241007095844-00000-BA92-CKXFG4FF6H.warc.gz",
"filename": "example-20241007095844-00000-BA92-CKXFG4FF6H.warc.gz",
}

request = record_transformer.request_for_record(record)
assert isinstance(request, Request)
assert request.url == "https://quotes.toscrape.com/"
assert request.url == "http://example.com"
assert request.method == "GET"

def test_response_for_record_invalid_response_type(self, warc_record_request):
@@ -56,15 +56,15 @@ def test_response_for_record_invalid_response_type(self, warc_record_request):
def test_response_for_record(self, warc_record_response):
response = record_transformer.response_for_record(warc_record_response)
assert isinstance(response, HtmlResponse)
assert response.url == "https://quotes.toscrape.com/"
assert response.url == "http://example.com"
assert response.status == 200
assert response.body == b'<!DOCTYPE html>\n<html lang="en">Welcome to scrapy-webarchive!</html>'


UTF8_PAYLOAD = u'\
HTTP/1.0 200 OK\r\n\
Content-Type: text/plain; charset="UTF-8"\r\n\
Content-Disposition: attachment; filename="испытание.txt"\r\n\
Content-Disposition: attachment; filename="example.txt"\r\n\
Custom-Header: somevalue\r\n\
Unicode-Header: %F0%9F%93%81%20text%20%F0%9F%97%84%EF%B8%8F\r\n\
\r\n\
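
Note: from the frozen-time assertion in `test_generate_warc_fname`, the generated name appears to follow `{prefix}-{UTC timestamp}-{serial}-{short hostname}.warc.gz`. A minimal sketch that reproduces the tested output (inferred from the assertions, not taken from the library source):

```python
import socket
from datetime import datetime, timezone

def sketch_warc_fname(prefix: str, serial: int = 0) -> str:
    # 2024-10-04 08:27:11 UTC -> "20241004082711", matching the frozen test time.
    timestamp = datetime.now(timezone.utc).strftime("%Y%m%d%H%M%S")
    # "example.local" -> "example", matching the monkeypatched hostname.
    hostname = socket.gethostname().split(".")[0]
    return f"{prefix}-{timestamp}-{serial:05d}-{hostname}.warc.gz"
```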
