Merge pull request #68 from RyouMon/feature-fast-crawl
Shut down spiders early if the collection has not been updated. Add an ID field to the ComicBookInfo metadata. Change the filename of nhentai comics.
close #54
RyouMon authored Dec 19, 2024
2 parents feca3cb + 696a075 commit 7f77961
Showing 30 changed files with 589 additions and 120 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -50,6 +50,7 @@ coverage.xml
*.py,cover
.hypothesis/
.pytest_cache/
tests/.trial_temp/

# Translations
*.mo
22 changes: 14 additions & 8 deletions src/favorites_crawler/commands/crawl.py
@@ -10,7 +10,7 @@

from favorites_crawler.constants.domains import LMMPIC_DOMAIN, NHENTAI_DOMAIN, TWITTER_DOMAIN
from favorites_crawler.utils.config import load_config, overwrite_spider_settings
from favorites_crawler.constants.path import DEFAULT_FAVORS_HOME
from favorites_crawler.utils.common import get_favors_home
from favorites_crawler.utils.auth import refresh_pixiv_token
from favorites_crawler.utils.cookies import load_cookie

@@ -30,15 +30,19 @@ def crawl_yandere():
@app.command('pixiv')
def crawl_pixiv():
    """Crawl your favorite illustrations from pixiv."""
    favors_home = os.getenv('FAVORS_HOME', DEFAULT_FAVORS_HOME)
    access_token = refresh_pixiv_token(favors_home)
    favors_home = get_favors_home()
    try:
        access_token = refresh_pixiv_token(favors_home)
    except Exception as e:
        print(e)
        exit(1)
    crawl('pixiv', access_token=access_token)


@app.command('nhentai')
def crawl_nhentai():
    """Crawl your favorite comics from nhentai."""
    favors_home = os.path.expanduser(os.getenv('FAVORS_HOME', DEFAULT_FAVORS_HOME))
    favors_home = get_favors_home()
    cookies = load_cookie(NHENTAI_DOMAIN, favors_home)
    crawl('nhentai', cookies=cookies)

@@ -47,15 +51,15 @@ def crawl_nhentai():
@app.command('twitter')
def crawl_twitter():
    """Crawl your favorite pictures from twitter."""
    favors_home = os.path.expanduser(os.getenv('FAVORS_HOME', DEFAULT_FAVORS_HOME))
    favors_home = get_favors_home()
    cookies = load_cookie(TWITTER_DOMAIN, favors_home)
    crawl('twitter', cookies=cookies)


@app.command('lemon')
def crawl_lemon(id_list: list[str] = typer.Option([], '--id', '-i')):
    """Crawl your favorite photo albums from lemon."""
    favors_home = os.path.expanduser(os.getenv('FAVORS_HOME', DEFAULT_FAVORS_HOME))
    favors_home = get_favors_home()
    cookies = load_cookie(LMMPIC_DOMAIN, favors_home)
    crawl('lemon', id_list=id_list, cookies=cookies)

@@ -66,7 +70,9 @@ def spider_closed(spider):
    print('Dumping Scrapy stats:', stats)
    if spider.name == 'yandere_vote':
        return
    if not (stats.get('item_scraped_count', 0) + stats.get('item_dropped_count', 0)):
    if stats.get('finish_reason') == 'fastly-finished':
        return
    elif not (stats.get('item_scraped_count', 0) + stats.get('item_dropped_count', 0)):
        print(Panel(
            '[red]Nothing was crawled, your cookies or token may have expired.',
            border_style="red",
@@ -82,7 +88,7 @@ def crawl(name, **kwargs):
    :param kwargs: kwargs passed to spider's __init__ method
    """
    spider = spider_loader.load(name)
    favors_home = os.getenv('FAVORS_HOME', DEFAULT_FAVORS_HOME)
    favors_home = get_favors_home()
    overwrite_spider_settings(spider, favors_home, load_config(favors_home))
    process = CrawlerProcess(scrapy_settings)
    process.crawl(spider, **kwargs)
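Note on the new 'fastly-finished' branch: when a spider closes itself because its bookmark collection has not changed since the last run (see the BaseSpider changes further down), zero items are scraped, so the old check would wrongly warn about expired cookies. Scrapy records the CloseSpider reason in the finish_reason stat, which spider_closed now inspects first. A minimal, illustrative sketch (the stats values are made up):

# Illustrative only: the kind of stats snapshot spider_closed() sees after a
# spider raised CloseSpider('fastly-finished') because nothing new was found.
stats = {
    'finish_reason': 'fastly-finished',  # recorded by Scrapy from the CloseSpider reason
    'item_scraped_count': 0,             # no new items, which is expected in this case
}

if stats.get('finish_reason') == 'fastly-finished':
    pass  # fast finish: skip the "cookies or token may have expired" warning
elif not (stats.get('item_scraped_count', 0) + stats.get('item_dropped_count', 0)):
    print('Nothing was crawled, your cookies or token may have expired.')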
12 changes: 6 additions & 6 deletions src/favorites_crawler/commands/login.py
@@ -1,12 +1,11 @@
import os
import shutil
from typing import Optional

import typer

from favorites_crawler.utils.auth import CustomGetPixivToken, parse_twitter_likes_url, parser_twitter_likes_features
from favorites_crawler.utils.config import dump_config, load_config
from favorites_crawler.constants.path import DEFAULT_FAVORS_HOME
from favorites_crawler.utils.common import get_favors_home


app = typer.Typer(help='Prepare auth information for crawling.', no_args_is_help=True)
@@ -33,10 +32,11 @@ def login_pixiv(
    If you do not provide your username and password, you will login manually on the web page
    """
    favors_home = os.getenv('FAVORS_HOME', DEFAULT_FAVORS_HOME)
    favors_home = get_favors_home()
    config = load_config(favors_home)
    token_getter = CustomGetPixivToken()
    try:
        print('Launching chrome...')
        login_info = token_getter.login(username=username, password=password)
    except Exception as e:
        print(f'Failed to login. {e!r}')
@@ -65,7 +65,7 @@ def login_yandere(
    """
    Login to yandere.
    """
    favors_home = os.getenv('FAVORS_HOME', DEFAULT_FAVORS_HOME)
    favors_home = get_favors_home()
    config = load_config(favors_home)
    yandere_config = config.setdefault('yandere', {})
    yandere_config['USERNAME'] = username
@@ -104,7 +104,7 @@ def login_twitter(
    6. Copy Authorization, X-Csrf-Token and RequestURL from request(Likes?variables...) input on terminal.\n
    7. Use "Get cookies.txt" browser extension download cookie file.
    """
    favors_home = os.getenv('FAVORS_HOME', DEFAULT_FAVORS_HOME)
    favors_home = get_favors_home()
    config = load_config(favors_home)
    twitter_config = config.setdefault('twitter', {})
    try:
@@ -140,7 +140,7 @@ def login_nhentai(
    4. Copy user-agent from any request.\n
    5. Use "Get cookies.txt" browser extension download cookie file.
    """
    favors_home = os.getenv('FAVORS_HOME', DEFAULT_FAVORS_HOME)
    favors_home = get_favors_home()
    config = load_config(favors_home)
    nhentai_config = config.setdefault('nhentai', {})
    try:
2 changes: 2 additions & 0 deletions src/favorites_crawler/exceptions.py
@@ -0,0 +1,2 @@
class LoginFailed(Exception):
    pass
8 changes: 2 additions & 6 deletions src/favorites_crawler/itemloaders.py
@@ -4,11 +4,7 @@
from itemloaders.processors import Compose, MapCompose

from favorites_crawler import items
from favorites_crawler.processors import take_first, identity, get_nhentai_id, wrap_credits, \
    original_url_from_nhentai_thumb_url, select_best_nhentai_title, clean_nhentai_title, \
    get_year_from_iso_format, get_month_from_iso_format, get_series_from_title, get_volume_from_title, \
    clean_parodies, get_lemon_page, get_pixiv_tags, get_yandere_tags, get_twitter_tags, fix_tweet_media_url, \
    tweet_time_2_datetime
from favorites_crawler.processors import *
from favorites_crawler.utils.text import convert_to_ascii


@@ -46,7 +42,7 @@ class NHentaiGalleryItemLoader(BaseItemLoader):
    series_out = Compose(take_first, get_series_from_title)
    volume_out = Compose(take_first, get_volume_from_title)
    title_out = Compose(select_best_nhentai_title, clean_nhentai_title)
    sort_title_out = Compose(select_best_nhentai_title, clean_nhentai_title)
    sort_title_out = join_nhentai_title
    file_urls_out = MapCompose(original_url_from_nhentai_thumb_url)
    credits_out = wrap_credits
    publicationYear_out = Compose(take_first, get_year_from_iso_format)
46 changes: 27 additions & 19 deletions src/favorites_crawler/items.py
@@ -1,6 +1,7 @@
import json
import datetime
from __future__ import annotations

import os.path
from datetime import datetime, date
from dataclasses import dataclass, field, fields
from urllib.parse import unquote, urlparse

@@ -15,7 +16,7 @@ class BaseItem:
    file_urls: list = field(default=None)
    tags: list = field(default=None)
    referer: str = field(default=None)
    created_time: datetime.datetime = field(default=None)
    created_time: datetime = field(default=None)

    def get_filepath(self, url, spider):
        folder_name = self.get_folder_name(spider)
@@ -30,12 +31,13 @@ def get_filename(self, url, spider):
    def get_folder_name(self, spider):
        name = self.title
        if not name:
            name = str(datetime.date.today())
            name = str(date.today())
        return drop_illegal_characters(name)


@dataclass
class ComicBookInfoItem:
    id: int = field(default=None, metadata={'is_ext_comic_info': True})
    title: str = field(default=None, metadata={'is_comic_info': True})
    series: str = field(default=None, metadata={'is_comic_info': True})
    publisher: str = field(default=None, metadata={'is_comic_info': True})
@@ -53,21 +55,26 @@ class ComicBookInfoItem:
    tags: list = field(default=None, metadata={'is_comic_info': True})
    comments: str = field(default=None, metadata={'is_comic_info': True})

    def get_comic_info(self):
        comic_book_info = {}
        for f in fields(self):
            if not f.metadata.get('is_comic_info', False):
                continue
            val = getattr(self, f.name)
            if not val:
                continue
            comic_book_info[f.name] = val

        return json.dumps({
    def get_comic_info(self) -> dict:
        metadata = {
            'appID': f'FavoritesCrawler',
            'lastModified': str(datetime.datetime.now()),
            'ComicBookInfo/1.0': comic_book_info,
        }, ensure_ascii=False)
            'lastModified': str(datetime.now()),
            'ComicBookInfo/1.0': {},
            'x-FavoritesCrawler': {},
        }
        comic_book_info = metadata['ComicBookInfo/1.0']
        ext_info = metadata['x-FavoritesCrawler']
        for field_ in fields(self):
            if field_.metadata.get('is_comic_info', False):
                value = getattr(self, field_.name)
                if value:
                    comic_book_info[field_.name] = value
            elif field_.metadata.get('is_ext_comic_info', False):
                value = getattr(self, field_.name)
                if value:
                    ext_info[field_.name] = value

        return metadata


@dataclass
@@ -120,11 +127,12 @@ def get_filename(self, url, spider):

@dataclass
class NHentaiGalleryItem(BaseItem, ComicBookInfoItem):
    id: int = field(default=None, metadata={'is_ext_comic_info': True})
    title: str = field(default=None, metadata={'is_comic_info': True})
    tags: list = field(default=None, metadata={'is_comic_info': True})
    parodies: str = field(default=None)
    characters: list = field(default=None)
    sort_title: str = field(default=None)

    def get_folder_name(self, _):
        return drop_illegal_characters(self.sort_title)
        return drop_illegal_characters(self.sort_title) + f' ({self.id})'
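To make the new metadata layout concrete, here is a small illustrative sketch of what get_comic_info() and the ID-suffixed folder name produce for an NHentaiGalleryItem (the id, title and tags below are invented):

item = NHentaiGalleryItem(id=12345, title='Example Title', sort_title='Example Title', tags=['full color'])

info = item.get_comic_info()
info['ComicBookInfo/1.0']   # {'title': 'Example Title', 'tags': ['full color']}
info['x-FavoritesCrawler']  # {'id': 12345} <- new extension block carrying the gallery id
info['lastModified']        # e.g. '2024-12-19 12:00:00.000000'

item.get_folder_name(None)  # 'Example Title (12345)' <- folder and .cbz name now include the id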
15 changes: 7 additions & 8 deletions src/favorites_crawler/pipelines.py
@@ -102,24 +102,23 @@ class ComicPipeline(BasePipeline):
    def __init__(self, store_uri, **kwargs):
        super().__init__(store_uri, **kwargs)
        self.files_path = Path(store_uri).resolve()
        self.comic_comments = {}
        self.comics = {}

    def close_spider(self, spider):
        for title, comment in self.comic_comments.items():
        for title, comic_info in self.comics.items():
            folder = self.files_path / title
            if not folder.exists():
                continue
            try:
                create_comic_archive(folder, comment=comment)
            except FileNotFoundError:
                pass
                create_comic_archive(folder, comic_info=comic_info)
            except Exception as e:
                spider.logger.error('Failed to create cbz file: %r', e)

    def process_item(self, item, spider):
        if hasattr(item, 'get_comic_info'):
            title = item.get_folder_name(spider)
            if (self.files_path / f'{title}.cbz').exists():
                raise DropItem(f'Comic file of "{title}" already exist, stop download this comic.')
            comment = item.get_comic_info()
            self.comic_comments[title] = bytes(comment, encoding='utf-8')
                raise DropItem(f'Comic "{title}" already exist, stop downloading this comic.')
            self.comics[title] = item.get_comic_info()

        return super().process_item(item, spider)
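ComicPipeline now keeps the metadata dict itself and defers serialization to create_comic_archive, whose diff is not loaded on this page. Previously the JSON string was passed as comment= bytes and presumably ended up as the CBZ's zip comment, where ComicBookInfo readers expect it. A minimal sketch, assuming the new comic_info= parameter does the equivalent internally (the helper name below is hypothetical):

import json
import zipfile
from pathlib import Path

def create_comic_archive_sketch(folder: Path, comic_info: dict) -> Path:
    """Zip a downloaded comic folder and embed its metadata as the archive comment."""
    cbz_path = folder.parent / f'{folder.name}.cbz'
    with zipfile.ZipFile(cbz_path, 'w') as archive:
        for page in sorted(folder.iterdir()):
            archive.write(page, arcname=page.name)
        # ComicBookInfo readers look for the JSON metadata in the zip comment.
        archive.comment = json.dumps(comic_info, ensure_ascii=False).encode('utf-8')
    return cbz_path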
6 changes: 6 additions & 0 deletions src/favorites_crawler/processors.py
@@ -1,3 +1,5 @@
from __future__ import annotations

import re
from datetime import datetime

@@ -163,3 +165,7 @@ def fix_tweet_media_url(url):

def tweet_time_2_datetime(tweet_time):
    return datetime.strptime(tweet_time, '%a %b %d %H:%M:%S %z %Y')


def join_nhentai_title(parts: list[str]) -> str:
    return ' '.join(map(lambda s: s.strip(), parts))
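The new join_nhentai_title processor backs the sort_title_out change in NHentaiGalleryItemLoader above: instead of picking and cleaning a single best title, the sort title is now built by stripping and joining every collected part. A quick illustration with made-up input:

join_nhentai_title(['  [Artist] Example Work  ', ' (Some Series) '])
# -> '[Artist] Example Work (Some Series)'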
39 changes: 39 additions & 0 deletions src/favorites_crawler/spiders/__init__.py
@@ -5,8 +5,47 @@

from abc import ABCMeta

from scrapy.exceptions import CloseSpider
from scrapy.spiders import CrawlSpider

from favorites_crawler.utils.common import get_favors_home
from favorites_crawler.utils.config import load_config, dump_config


class BaseSpider(CrawlSpider, metaclass=ABCMeta):
    custom_settings = {}
    cookies = None

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.last_bookmark_id = self.custom_settings.get('LAST_BOOKMARK_ID')
        self.last_bookmark_id_updated = False

    def close_spider_when_bookmark_not_updated(self, response, **kwargs):
        """Close spider when bookmark not updated"""
        last_bookmark_id = self.get_last_bookmark_id(response, **kwargs)
        self._close_spider_when_bookmark_not_updated(last_bookmark_id)
        self.update_last_bookmark_id(last_bookmark_id)

    def get_last_bookmark_id(self, response, **kwargs):
        """Get last bookmark id from start_url response"""
        raise NotImplementedError()

    def _close_spider_when_bookmark_not_updated(self, bookmark_id):
        """Close spider when current bookmark id equals to last bookmark id."""
        if self.last_bookmark_id and (self.last_bookmark_id == bookmark_id):
            self.logger.info('Bookmark not updated, closing spider.')
            raise CloseSpider('fastly-finished')

    def update_last_bookmark_id(self, bookmark_id):
        """Update last bookmark id"""
        if not bookmark_id or self.last_bookmark_id_updated:
            return
        self.last_bookmark_id = bookmark_id
        self.last_bookmark_id_updated = True
        favors_home = get_favors_home()
        config = load_config(favors_home)
        spider_config = config.setdefault(self.name, {})
        spider_config['LAST_BOOKMARK_ID'] = bookmark_id
        dump_config(config, favors_home)
        self.logger.info('Updated LAST_BOOKMARK_ID: %s', bookmark_id)
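get_last_bookmark_id() is left abstract, so each concrete spider has to supply it and call close_spider_when_bookmark_not_updated() from its first response callback. A hypothetical subclass sketch (the spider name and selector are invented, not part of this commit):

class ExampleFavoritesSpider(BaseSpider):
    """Hypothetical example, not one of the real spiders in this repository."""
    name = 'example'

    def parse_start_url(self, response, **kwargs):
        # Raises CloseSpider('fastly-finished') when the newest bookmark id matches
        # the LAST_BOOKMARK_ID saved by the previous run; otherwise persists the new id.
        self.close_spider_when_bookmark_not_updated(response, **kwargs)
        return super().parse_start_url(response, **kwargs)

    def get_last_bookmark_id(self, response, **kwargs):
        # Invented selector: the id of the most recent bookmark on the first results page.
        return response.css('.bookmark::attr(data-id)').get()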
