From 65ae11164f8551119737b86ece077f345a499ac9 Mon Sep 17 00:00:00 2001 From: Wen Date: Sun, 21 May 2023 17:07:05 +0800 Subject: [PATCH 1/4] feat(spiders/yandere): can organize yandere post by artist --- src/favorites_crawler/constants/endpoints.py | 3 +- src/favorites_crawler/itemloaders.py | 1 + src/favorites_crawler/items.py | 8 +++-- src/favorites_crawler/spiders/yandere.py | 33 +++++++++++++------- 4 files changed, 30 insertions(+), 15 deletions(-) diff --git a/src/favorites_crawler/constants/endpoints.py b/src/favorites_crawler/constants/endpoints.py index 5211592..33c5cc8 100644 --- a/src/favorites_crawler/constants/endpoints.py +++ b/src/favorites_crawler/constants/endpoints.py @@ -3,7 +3,8 @@ PIXIV_LOGIN_URL = "https://app-api.pixiv.net/web/v1/login" PIXIV_AUTH_TOKEN_URL = "https://oauth.secure.pixiv.net/auth/token" -YANDERE_POST_URL = 'https://yande.re/post.json' +YANDERE_LIST_POST_URL = 'https://yande.re/post.json' +YANDERE_SHOW_POST_URL = 'https://yande.re/post/show/{id}' LEMON_PIC_USER_FAVORITES_URL = 'https://www.lmmbtc.com/user-center' diff --git a/src/favorites_crawler/itemloaders.py b/src/favorites_crawler/itemloaders.py index 0c1e8cd..29e1cda 100644 --- a/src/favorites_crawler/itemloaders.py +++ b/src/favorites_crawler/itemloaders.py @@ -23,6 +23,7 @@ class YanderePostItemLoader(ItemLoader): default_output_processor = take_first file_urls_out = identity + artist_out = Compose(take_first, lambda s: s.strip()) class NHentaiGalleryItemLoader(ItemLoader): diff --git a/src/favorites_crawler/items.py b/src/favorites_crawler/items.py index c6fccb7..7071f60 100644 --- a/src/favorites_crawler/items.py +++ b/src/favorites_crawler/items.py @@ -82,8 +82,12 @@ def get_folder_name(self, spider): @dataclass class YanderePostItem(BaseItem): - def get_folder_name(self, _): - return '' + artist: str = field(default=None) + + def get_folder_name(self, spider): + if not spider.crawler.settings.getbool('ENABLE_ORGANIZE_BY_ARTIST'): + return '' + return self.artist or 
'unknown' @dataclass diff --git a/src/favorites_crawler/spiders/yandere.py b/src/favorites_crawler/spiders/yandere.py index fe0411e..cbff720 100644 --- a/src/favorites_crawler/spiders/yandere.py +++ b/src/favorites_crawler/spiders/yandere.py @@ -6,7 +6,7 @@ from favorites_crawler.spiders import BaseSpider from favorites_crawler.constants.domains import YANDERE_DOMAIN from favorites_crawler.itemloaders import YanderePostItemLoader -from favorites_crawler.constants.endpoints import YANDERE_POST_URL +from favorites_crawler.constants.endpoints import YANDERE_LIST_POST_URL, YANDERE_SHOW_POST_URL class YandereSpider(BaseSpider): @@ -28,26 +28,35 @@ def start_requests(self): raise CloseSpider('Did you run "favors login yandere"?') self.params['tags'] = f'vote:>=1:{username}' - yield Request(f'{YANDERE_POST_URL}?{urlencode(self.params)}') + yield Request(f'{YANDERE_LIST_POST_URL}?{urlencode(self.params)}') def parse_start_url(self, response, **kwargs): - for request_or_item in self.parse(response, **kwargs): - yield request_or_item - - def parse(self, response, **kwargs): - """Spider Contracts: + """Parse list post url @url https://yande.re/post.json?limit=100&page=1 - @returns item 100 - @returns requests 1 - @scrapes file_urls + @returns requests 101 """ posts = response.json() if len(posts) == self.limit: self.params['page'] += 1 - yield Request(f'{YANDERE_POST_URL}?{urlencode(self.params)}') + yield Request(f'{YANDERE_LIST_POST_URL}?{urlencode(self.params)}', callback=self.parse_start_url) for post in posts: loader = YanderePostItemLoader() loader.add_value('file_urls', post['file_url']) - yield loader.load_item() + if self.settings.getbool('ENABLE_ORGANIZE_BY_ARTIST'): + yield Request(YANDERE_SHOW_POST_URL.format(id=post['id']), + callback=self.parse, cb_kwargs={'loader': loader}) + else: + yield loader.load_item() + + def parse(self, response, **kwargs): + """Parse show post url + @url https://yande.re/post/show/1056911 + @returns item 1 + @scrapes artist + """ + 
loader = kwargs.get('loader', YanderePostItemLoader()) + loader.selector = response + loader.add_xpath('artist', '//li[@class="tag-type-artist"]/a[last()]/text()') + yield loader.load_item() From d9680744dc6480f40c17b27cdaf41c30b4e7602c Mon Sep 17 00:00:00 2001 From: Wen Date: Sun, 21 May 2023 17:08:54 +0800 Subject: [PATCH 2/4] refactor(spiders/pixiv): using ENABLE_ORGANIZE_BY_ARTIST control "Organize file by artist" --- src/favorites_crawler/items.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/favorites_crawler/items.py b/src/favorites_crawler/items.py index 7071f60..92a970b 100644 --- a/src/favorites_crawler/items.py +++ b/src/favorites_crawler/items.py @@ -74,7 +74,8 @@ class PixivIllustItem(BaseItem): user_id: str = field(default=None) def get_folder_name(self, spider): - if not spider.crawler.settings.getbool('FAVORS_PIXIV_ENABLE_ORGANIZE_BY_USER'): + if not (spider.crawler.settings.getbool('FAVORS_PIXIV_ENABLE_ORGANIZE_BY_USER') + or spider.crawler.settings.getbool('ENABLE_ORGANIZE_BY_ARTIST')): return '' return self.user_id or 'unknown' From 54acdd51eaa28b8e3d1c6b2db24c83df372b117e Mon Sep 17 00:00:00 2001 From: Wen Date: Sun, 21 May 2023 17:09:13 +0800 Subject: [PATCH 3/4] update README.md --- README.md | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 9c6d74d..48c8729 100644 --- a/README.md +++ b/README.md @@ -115,7 +115,10 @@ yandere: USERNAME: xxxx ``` -If you want save pixiv files to `pictures/a`, and want save yandere files to `pictures/b`, you can modify config file like this: +## Download location +By default, pictures will download to working directory. +If you want to change download location, you can add FILES_STORE option to config. 
+For example, if you want to save pixiv files to `pictures/a`, and want to save yandere files to `pictures/b`, you can modify config file like this: ```yaml pixiv: ACCESS_TOKEN: xxxxxxxxxxxxxxxxxxxxxxxxxxxx @@ -127,11 +130,21 @@ yandere: FILES_STORE: pictures/b ``` +## Organize files by artist if you want to organize pixiv illust by user, add this line to your config: ```yaml ... pixiv: - FAVORS_PIXIV_ENABLE_ORGANIZE_BY_USER: true # add this line to your pixiv config + # FAVORS_PIXIV_ENABLE_ORGANIZE_BY_USER: true # (Deprecated) + ENABLE_ORGANIZE_BY_ARTIST: true # add this line to your pixiv config + ... +... +``` +if you want to organize yandere post by artist, add this line to your config: +```yaml +... +yandere: + ENABLE_ORGANIZE_BY_ARTIST: true # add this line to your yandere config ... ... ``` \ No newline at end of file From 3660a6eaa63deae59c27e16d25c961d18d5ede86 Mon Sep 17 00:00:00 2001 From: Wen Date: Sun, 21 May 2023 17:14:40 +0800 Subject: [PATCH 4/4] test: fix test --- tests/test_utils/test_files.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_utils/test_files.py b/tests/test_utils/test_files.py index 4ce7e9b..fa45f42 100644 --- a/tests/test_utils/test_files.py +++ b/tests/test_utils/test_files.py @@ -33,7 +33,7 @@ def test_cbz_archive_should_contains_page(self, comic_path): comic_archive = create_comic_archive(comic_path) with ZipFile(comic_archive) as zf: - assert zf.namelist() == ['1.jpg', '2.jpg'] + assert sorted(zf.namelist()) == ['1.jpg', '2.jpg'] def test_should_write_comment_to_archive(self, comic_path): comic_archive = create_comic_archive(comic_path, comment=b"I'm a comic.")