Handle file not found for spider and downloader middlewares and improve type hints
Wesley van Lee committed Oct 15, 2024
1 parent f616f1c commit e0dea76
Showing 7 changed files with 200 additions and 152 deletions.
34 changes: 25 additions & 9 deletions scrapy_webarchive/cdxj.py
@@ -1,32 +1,48 @@
import json
import re
from typing import List
from dataclasses import dataclass, field
from typing import Any, List

from cdxj_indexer.main import CDXJIndexer

# based on https://github.com/internetarchive/cdx-summary/blob/main/cdxsummary/parser.py
CDXREC = re.compile(
r"^(?P<surt>(?P<host>[^\)\s]+)\)(?P<path>[^\?\s]+)?(\?(?P<query>\S+))?)"
r"\s(?P<datetime>(?P<year>\d{4})(?P<month>\d{2})(?P<day>\d{2})(?P<hour>\d{2})(?P<minute>\d{2})(?P<second>\d{2})(?:\d{3})?)"
r"\s(?P<data>{.*})"
)


@dataclass
class CdxjRecord:
def _parse(self, line):
wacz_file: Any
surt: str
host: str
path: str = ""
query: str = ""
datetime: str = ""
year: str = ""
month: str = ""
day: str = ""
hour: str = ""
minute: str = ""
second: str = ""
data: dict = field(default_factory=dict)

@staticmethod
def _parse(line: str):
return CDXREC.match(line)

def __init__(self, cdxline):
m = self._parse(cdxline.strip())
@classmethod
def from_cdxline(cls, cdxline: str, wacz_file):
m = cls._parse(cdxline.strip())

if not m:
raise ValueError(f"Invalid CDXJ line: '{cdxline.strip()}'")

for key, value in m.groupdict(default="").items():
if key == "data":
value = json.loads(value)
parsed_data = m.groupdict(default="")
parsed_data['data'] = json.loads(parsed_data['data'])

setattr(self, key, value)
return cls(**parsed_data, wacz_file=wacz_file)

def __str__(self):
return str(self.__dict__)
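
For context, a minimal sketch of how the refactored `CdxjRecord` could be used after this change; the sample CDXJ line and the `wacz_file=None` stand-in are illustrative, not taken from the repository:

```python
# Sketch only: the sample CDXJ line and the wacz_file=None stand-in are
# hypothetical; in the middlewares, wacz_file is the WaczFile the line came from.
from scrapy_webarchive.cdxj import CdxjRecord

line = 'com,example)/index.html 20241015120000 {"url": "https://example.com/index.html", "status": "200"}'

record = CdxjRecord.from_cdxline(line, wacz_file=None)

print(record.host)         # "com,example"
print(record.path)         # "/index.html"
print(record.datetime)     # "20241015120000"
print(record.data["url"])  # "https://example.com/index.html"
```

The parsing regex and the `ValueError` on malformed lines are unchanged; what changes is that the matched groups now populate a typed dataclass, including a back-reference to the `wacz_file` the record came from, instead of being set one by one via `setattr`.
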
74 changes: 50 additions & 24 deletions scrapy_webarchive/downloadermiddlewares.py
@@ -1,4 +1,5 @@
import re
from typing import IO, List

from scrapy import signals
from scrapy.crawler import Crawler
@@ -9,7 +10,7 @@
from scrapy.spiders import Spider
from scrapy.statscollectors import StatsCollector
from smart_open import open
from typing_extensions import Self, Union
from typing_extensions import Self

from scrapy_webarchive.wacz import MultiWaczFile, WaczFile
from scrapy_webarchive.warc import record_transformer
@@ -22,6 +23,8 @@ class WaczMiddleware:
Loads the index fully into memory, but lazily loads pages.
This helps to work with large archives, including remote ones.
"""

wacz: WaczFile | MultiWaczFile

def __init__(self, settings: Settings, stats: StatsCollector) -> None:
self.stats = stats
@@ -43,22 +46,40 @@ def from_crawler(cls, crawler: Crawler) -> Self:

def spider_opened(self, spider: Spider) -> None:
tp = {"timeout": self.timeout}
self.wacz: Union[WaczFile, MultiWaczFile]

if len(self.wacz_urls) == 1:
spider.logger.info(f"[WACZDownloader] Opening WACZ {self.wacz_urls[0]}")
self.wacz = WaczFile(open(self.wacz_urls[0], "rb", transport_params=tp))
multiple_entries = len(self.wacz_urls) != 1

def open_wacz_file(wacz_url: str) -> IO[bytes] | None:
spider.logger.info(f"[WACZDownloader] Opening WACZ {wacz_url}")

try:
return open(wacz_url, "rb", transport_params=tp)
except OSError:
spider.logger.error(f"[WACZDownloader] Could not open WACZ {wacz_url}")
return None

if not multiple_entries:
wacz_url = self.wacz_urls[0]
wacz_file = open_wacz_file(wacz_url)
if wacz_file:
self.wacz = WaczFile(wacz_file)
else:
spider.logger.info(f"[WACZDownloader] Opening WACZs {self.wacz_urls}")
self.wacz = MultiWaczFile(
[open(u, "rb", transport_params=tp) for u in self.wacz_urls]
)
wacz_files: List[IO[bytes]] = []

for wacz_url in self.wacz_urls:
wacz_file = open_wacz_file(wacz_url)
if wacz_file:
wacz_files.append(wacz_file)

if wacz_files:
self.wacz = MultiWaczFile(wacz_files)

def process_request(self, request: Request, spider: Spider):
if not hasattr(self, 'wacz'):
self.stats.set_value("wacz/no_valid_sources", True, spider=spider)
raise IgnoreRequest()

# ignore blacklisted pages (to avoid crawling e.g. redirects from whitelisted pages to unwanted ones)
if hasattr(spider, "archive_blacklist_regexp") and re.search(
spider.archive_blacklist_regexp, request.url
):
if hasattr(spider, "archive_blacklist_regexp") and re.search(spider.archive_blacklist_regexp, request.url):
self.stats.inc_value("wacz/request_blacklisted", spider=spider)
raise IgnoreRequest()

@@ -68,17 +89,22 @@ def process_request(self, request: Request, spider: Spider):
raise IgnoreRequest()

# get record from existing index entry, or else lookup by URL
record = self.wacz.get_record(request.meta.get("wacz_index_entry", request.url))
if record:
response = record_transformer.response_for_record(record)

if not response:
self.stats.inc_value("wacz/response_not_recognized", spider=spider)
raise IgnoreRequest()

self.stats.inc_value("wacz/hit", spider=spider)
return response
if request.meta.get("cdxj_record"):
warc_record = self.wacz.get_warc_from_cdxj_record(cdxj_record=request.meta["cdxj_record"])
else:
# when page not found in archive, return 404, and record it in a statistic
warc_record = self.wacz.get_warc_from_url(url=request.url)

# When page not found in archive, return 404, and record it in a statistic
if not warc_record:
self.stats.inc_value("wacz/response_not_found", spider=spider)
return Response(url=request.url, status=404)

# Record found
response = record_transformer.response_for_record(warc_record)

if not response:
self.stats.inc_value("wacz/response_not_recognized", spider=spider)
raise IgnoreRequest()

self.stats.inc_value("wacz/hit", spider=spider)
return response
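
Both middlewares now route every WACZ source through a small helper that logs and skips sources it cannot open. A standalone sketch of that pattern (the example URL and the 60-second default timeout are assumptions, not values from this commit):

```python
# Standalone sketch of the "missing WACZ source" handling added here; the
# example URL and the 60-second default timeout are illustrative assumptions.
from __future__ import annotations

from typing import IO

from smart_open import open as smart_open


def open_wacz_file(wacz_url: str, timeout: float = 60.0) -> IO[bytes] | None:
    """Return a binary file object for the WACZ, or None if it cannot be opened."""
    try:
        return smart_open(wacz_url, "rb", transport_params={"timeout": timeout})
    except OSError:
        # FileNotFoundError on local paths and most remote transport failures
        # are OSError subclasses, so a single except clause covers both.
        return None


if __name__ == "__main__":
    fh = open_wacz_file("https://example.com/archives/example.wacz")
    if fh is None:
        print("WACZ source could not be opened; it would be skipped")
```

In the middleware itself, a request that arrives in `process_request` with no usable archive is ignored and flagged under `wacz/no_valid_sources`, while a URL missing from an otherwise valid archive now yields a synthetic 404 response counted under `wacz/response_not_found`.
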
47 changes: 34 additions & 13 deletions scrapy_webarchive/middleware.py
@@ -1,4 +1,5 @@
import re
from typing import IO, List
from urllib.parse import urlparse

from scrapy import Request, Spider, signals
@@ -7,13 +8,15 @@
from scrapy.settings import Settings
from scrapy.statscollectors import StatsCollector
from smart_open import open
from typing_extensions import Iterable, Self, Union
from typing_extensions import Iterable, Self

from scrapy_webarchive.wacz import MultiWaczFile, WaczFile
from scrapy_webarchive.warc import record_transformer


class WaczCrawlMiddleware:
wacz: WaczFile | MultiWaczFile

def __init__(self, settings: Settings, stats: StatsCollector) -> None:
self.stats = stats
wacz_url = settings.get("SW_WACZ_SOURCE_URL", None)
@@ -37,24 +40,42 @@ def spider_opened(self, spider: Spider) -> None:
return

tp = {"timeout": self.timeout}
self.wacz: Union[WaczFile, MultiWaczFile]
multiple_entries = len(self.wacz_urls) != 1

def open_wacz_file(wacz_url: str) -> IO[bytes] | None:
spider.logger.info(f"[WACZDownloader] Opening WACZ {wacz_url}")

try:
return open(wacz_url, "rb", transport_params=tp)
except OSError:
spider.logger.error(f"[WACZDownloader] Could not open WACZ {wacz_url}")
return None

if len(self.wacz_urls) == 1:
spider.logger.info(f"[WACZDownloader] Opening WACZ {self.wacz_urls[0]}")
self.wacz = WaczFile(open(self.wacz_urls[0], "rb", transport_params=tp))
if not multiple_entries:
wacz_url = self.wacz_urls[0]
wacz_file = open_wacz_file(wacz_url)
if wacz_file:
self.wacz = WaczFile(wacz_file)
else:
spider.logger.info(f"[WACZDownloader] Opening WACZs {self.wacz_urls}")
self.wacz = MultiWaczFile(
[open(u, "rb", transport_params=tp) for u in self.wacz_urls]
)
wacz_files: List[IO[bytes]] = []

for wacz_url in self.wacz_urls:
wacz_file = open_wacz_file(wacz_url)
if wacz_file:
wacz_files.append(wacz_file)

if wacz_files:
self.wacz = MultiWaczFile(wacz_files)

def process_start_requests(self, start_requests: Iterable[Request], spider: Spider):
if not self.crawl:
if not self.crawl or not hasattr(self, 'wacz'):
for request in start_requests:
yield request
else: # ignore original start requests, just yield all responses found

# Ignore original start requests, just yield all responses found
else:
for entry in self.wacz.iter_index():
url = entry["url"]
url = entry.data["url"]

# filter out off-site responses
if hasattr(spider, "allowed_domains") and urlparse(url).hostname not in spider.allowed_domains:
@@ -71,6 +92,6 @@ def process_start_requests(self, start_requests: Iterable[Request], spider: Spider):
yield record_transformer.request_for_record(
entry,
flags=["wacz_start_request"],
meta={"wacz_index_entry": entry},
meta={"cdxj_record": entry},
dont_filter=True,
)
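
For completeness, a hedged sketch of how the two middlewares might be enabled in a project after this change; only `SW_WACZ_SOURCE_URL` is visible in this diff, so the priority numbers, the `SPIDER_MIDDLEWARES` placement, and the example URL are illustrative assumptions:

```python
# settings.py sketch. Only SW_WACZ_SOURCE_URL appears in this diff; the
# middleware priorities and the example URL are illustrative assumptions.
DOWNLOADER_MIDDLEWARES = {
    "scrapy_webarchive.downloadermiddlewares.WaczMiddleware": 543,
}

SPIDER_MIDDLEWARES = {
    "scrapy_webarchive.middleware.WaczCrawlMiddleware": 100,
}

# After this commit, a source that cannot be opened is logged and skipped
# instead of failing the whole spider at startup.
SW_WACZ_SOURCE_URL = "https://example.com/archives/example.wacz"
```

With the crawl middleware active, start requests are replaced by requests generated from the archive's CDXJ index, each carrying its `CdxjRecord` in `request.meta["cdxj_record"]` so the downloader middleware can fetch the WARC record directly instead of looking it up by URL.
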
