From 19f346269864f2f19710724e65a4829be5e56aa3 Mon Sep 17 00:00:00 2001 From: Wen Liang Date: Fri, 6 Dec 2024 16:45:51 +0800 Subject: [PATCH] fix: fix load_cookie --- requirements.txt | 3 +- src/favorites_crawler/commands/crawl.py | 14 +++++++-- src/favorites_crawler/spiders/lemon.py | 5 ---- src/favorites_crawler/spiders/nhentai.py | 5 ---- src/favorites_crawler/spiders/twitter.py | 2 -- src/favorites_crawler/utils/cookies.py | 17 +++++++---- tests/test_utils/test_cookies.py | 36 ++++++++++++++++++++++++ 7 files changed, 61 insertions(+), 21 deletions(-) create mode 100644 tests/test_utils/test_cookies.py diff --git a/requirements.txt b/requirements.txt index 7f8b970..2433cf2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,4 +6,5 @@ unidecode==1.3.8 langdetect==1.0.9 pykakasi==2.2.1 gppt==4.1.0 -typer>=0.14.0 \ No newline at end of file +typer>=0.14.0 +loguru>=0.7.2 \ No newline at end of file diff --git a/src/favorites_crawler/commands/crawl.py b/src/favorites_crawler/commands/crawl.py index bf16250..a50b949 100644 --- a/src/favorites_crawler/commands/crawl.py +++ b/src/favorites_crawler/commands/crawl.py @@ -8,9 +8,11 @@ from scrapy.utils.project import get_project_settings from scrapy.spiderloader import SpiderLoader +from favorites_crawler.constants.domains import LMMPIC_DOMAIN, NHENTAI_DOMAIN, TWITTER_DOMAIN from favorites_crawler.utils.config import load_config, overwrite_spider_settings from favorites_crawler.constants.path import DEFAULT_FAVORS_HOME from favorites_crawler.utils.auth import refresh_pixiv_token +from favorites_crawler.utils.cookies import load_cookie app = typer.Typer(help='Crawl your favorites from websites.', no_args_is_help=True) @@ -36,20 +38,26 @@ def crawl_pixiv(): @app.command('nhentai') def crawl_nhentai(): """Crawl your favorite comics from nhentai.""" - crawl('nhentai') + favors_home = os.path.expanduser(os.getenv('FAVORS_HOME', DEFAULT_FAVORS_HOME)) + cookies = load_cookie(NHENTAI_DOMAIN, favors_home) + crawl('nhentai', cookies=cookies) @app.command('x') @app.command('twitter') def crawl_twitter(): """Crawl your favorite pictures from twitter.""" - crawl('twitter') + favors_home = os.path.expanduser(os.getenv('FAVORS_HOME', DEFAULT_FAVORS_HOME)) + cookies = load_cookie(TWITTER_DOMAIN, favors_home) + crawl('twitter', cookies=cookies) @app.command('lemon') def crawl_lemon(id_list: list[str] = typer.Option([], '--id', '-i')): """Crawl your favorite photo albums from lemon.""" - crawl('lemon', id_list=id_list) + favors_home = os.path.expanduser(os.getenv('FAVORS_HOME', DEFAULT_FAVORS_HOME)) + cookies = load_cookie(LMMPIC_DOMAIN, favors_home) + crawl('lemon', id_list=id_list, cookies=cookies) def spider_closed(spider): diff --git a/src/favorites_crawler/spiders/lemon.py b/src/favorites_crawler/spiders/lemon.py index 4fc1923..daa897a 100644 --- a/src/favorites_crawler/spiders/lemon.py +++ b/src/favorites_crawler/spiders/lemon.py @@ -5,7 +5,6 @@ from favorites_crawler.itemloaders import LemonPicPostItemLoader from favorites_crawler.constants.endpoints import LEMON_PIC_USER_CENTER_URL, LEMON_PIC_POST_URL_PATTERN from favorites_crawler.constants.domains import LMMPIC_DOMAIN -from favorites_crawler.utils.cookies import load_cookie class LemonSpider(BaseSpider): @@ -25,10 +24,6 @@ class LemonSpider(BaseSpider): 'ITEM_PIPELINES': {'favorites_crawler.pipelines.ComicPipeline': 0}, } - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.cookies = load_cookie(LMMPIC_DOMAIN) - def start_requests(self): if hasattr(self, 'id_list') and self.id_list: self.logger.debug('GET id_list: %s', self.id_list) diff --git a/src/favorites_crawler/spiders/nhentai.py b/src/favorites_crawler/spiders/nhentai.py index ee2e181..f39d3d9 100644 --- a/src/favorites_crawler/spiders/nhentai.py +++ b/src/favorites_crawler/spiders/nhentai.py @@ -5,7 +5,6 @@ from favorites_crawler.itemloaders import NHentaiGalleryItemLoader from favorites_crawler.constants.endpoints import NHENTAI_USER_FAVORITES_URL from favorites_crawler.constants.domains import NHENTAI_DOMAIN -from favorites_crawler.utils.cookies import load_cookie class NHentaiSpider(BaseSpider): @@ -27,10 +26,6 @@ class NHentaiSpider(BaseSpider): 'ITEM_PIPELINES': {'favorites_crawler.pipelines.ComicPipeline': 0}, } - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.cookies = load_cookie(NHENTAI_DOMAIN) - def start_requests(self): yield Request(NHENTAI_USER_FAVORITES_URL, cookies=self.cookies) diff --git a/src/favorites_crawler/spiders/twitter.py b/src/favorites_crawler/spiders/twitter.py index 9516879..749036e 100644 --- a/src/favorites_crawler/spiders/twitter.py +++ b/src/favorites_crawler/spiders/twitter.py @@ -7,7 +7,6 @@ from favorites_crawler.itemloaders import TwitterTweetItemLoader from favorites_crawler.constants.domains import TWITTER_DOMAIN from favorites_crawler.constants.endpoints import TWITTER_LIKES_URL -from favorites_crawler.utils.cookies import load_cookie from favorites_crawler.utils.common import DictRouter @@ -31,7 +30,6 @@ def current_url(self): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - self.cookies = load_cookie(TWITTER_DOMAIN) self.base_url = TWITTER_LIKES_URL.format(id=self.custom_settings.get('LIKES_ID')) self.variables = { "userId": str(self.custom_settings.get('USER_ID')), diff --git a/src/favorites_crawler/utils/cookies.py b/src/favorites_crawler/utils/cookies.py index 2ca5ed6..640a49f 100644 --- a/src/favorites_crawler/utils/cookies.py +++ b/src/favorites_crawler/utils/cookies.py @@ -1,12 +1,19 @@ +from __future__ import annotations + import os +from pathlib import Path from http.cookiejar import MozillaCookieJar - -cookie_home = os.path.expanduser('~/.favorites_crawler') +from loguru import logger -def load_cookie(domain): +def load_cookie(domain: str, home: str | Path) -> dict: """Load 'Netscape HTTP Cookie File' as dict""" - cookiejar = MozillaCookieJar() - cookiejar.load(os.path.join(cookie_home, f'{domain}_cookies.txt')) + try: + cookiejar = MozillaCookieJar() + cookie_file = os.path.join(home, f'{domain}_cookies.txt') + cookiejar.load(cookie_file) + except Exception as e: + logger.error('Failed to load cookie {}, {!r}', cookie_file, e) + return {} return {getattr(c, 'name'): getattr(c, 'value') for c in cookiejar} diff --git a/tests/test_utils/test_cookies.py b/tests/test_utils/test_cookies.py new file mode 100644 index 0000000..fc1ffc9 --- /dev/null +++ b/tests/test_utils/test_cookies.py @@ -0,0 +1,36 @@ +from favorites_crawler.utils.cookies import load_cookie + + +class TestLoadCookie: + def test_load_cookie_when_file_exists(self, tmp_path): + domain = 'localhost' + cookie_file = tmp_path / f'{domain}_cookies.txt' + cookie_file.touch() + cookie_file.write_text( + """# Netscape HTTP Cookie File + # http://curl.haxx.se/rfc/cookie_spec.html + # This is a generated file! Do not edit. + + localhost FALSE / TRUE 9933144989 User-Agent Test + """ + ) + + cookie = load_cookie(domain, tmp_path) + + assert cookie == {'User-Agent': 'Test'} + + def test_load_cookie_when_file_not_exists(self, tmp_path): + domain = 'localhost' + + cookie = load_cookie(domain, tmp_path) + + assert cookie == {} + + def test_load_cookie_when_file_invalid(self, tmp_path): + domain = 'localhost' + cookie_file = tmp_path / f'{domain}_cookies.txt' + cookie_file.touch() + cookie_file.write_text('') + cookie = load_cookie(domain, tmp_path) + + assert cookie == {}