From e42169082d8376595b123442d06c333957a56e7e Mon Sep 17 00:00:00 2001 From: wen Date: Mon, 2 Dec 2024 15:20:10 +0800 Subject: [PATCH] refactor: add global config --- README.md | 25 ++---- src/favorites_crawler/commands/crawl.py | 11 ++- src/favorites_crawler/utils/config.py | 80 +++++++++++++---- tests/test_utils/test_config.py | 113 ++++++++++++++++++++++++ 4 files changed, 193 insertions(+), 36 deletions(-) create mode 100644 tests/test_utils/test_config.py diff --git a/README.md b/README.md index 4927eb3..6634990 100644 --- a/README.md +++ b/README.md @@ -126,6 +126,8 @@ You can set any [scrapy built-in settings](https://docs.scrapy.org/en/latest/top By default, file content likes this: ```yaml +global: + ENABLE_ORGANIZE_BY_ARTIST: true pixiv: ACCESS_TOKEN: xxxxxxxxxxxxxxxxxxxxxxxxxxxx REFRESH_TOKEN: xxxxxxxxxxxxxxxxxxxxxxxxxxxx @@ -150,27 +152,18 @@ yandere: ``` ## Organize file by artist -if you want to organize pixiv illust by user, add this line to your config: +if you want to organize pixiv and yandere files by artist, add this line to your config: ```yaml -pixiv: - # FAVORS_PIXIV_ENABLE_ORGANIZE_BY_USER: true # (Deprecation) - ENABLE_ORGANIZE_BY_ARTIST: true # add this line to your yandere config -``` -if you want to organize yandere post by artist, add this line to your config: -```yaml -yandere: - ENABLE_ORGANIZE_BY_ARTIST: true # add this line to your yandere config +global: + ENABLE_ORGANIZE_BY_ARTIST: true ``` ## Store tags to IPTC/Keywords -only support pixiv and yandere. +only support pixiv, yandere and twitter. 
```yaml -yandere: - ENABLE_WRITE_IPTC_KEYWORDS: true # default: true - EXIF_TOOL_EXECUTABLE: '' # default None -pixiv: - ENABLE_WRITE_IPTC_KEYWORDS: true # default: true - EXIF_TOOL_EXECUTABLE: '' # default None +global: + ENABLE_WRITE_IPTC_KEYWORDS: true + EXIF_TOOL_EXECUTABLE: '' # default None, If the executable is not in the path, set it manually ``` # Restore your favorites diff --git a/src/favorites_crawler/commands/crawl.py b/src/favorites_crawler/commands/crawl.py index ba815be..7c3b072 100644 --- a/src/favorites_crawler/commands/crawl.py +++ b/src/favorites_crawler/commands/crawl.py @@ -8,14 +8,13 @@ from scrapy.utils.project import get_project_settings from scrapy.spiderloader import SpiderLoader -from favorites_crawler.utils.config import load_config, overwrite_settings +from favorites_crawler.utils.config import load_config, overwrite_spider_settings app = typer.Typer(help='Crawl your favorites from websites.', no_args_is_help=True) os.environ.setdefault('SCRAPY_SETTINGS_MODULE', 'favorites_crawler.settings') scrapy_settings = get_project_settings() spider_loader = SpiderLoader(scrapy_settings) -overwrite_settings(spider_loader, scrapy_settings, load_config()) @app.command('yandere') @@ -70,8 +69,14 @@ def crawl(name, **kwargs): :param name: spider name :param kwargs: kwargs passed to spider's __init__ method """ + spider = spider_loader.load(name) + overwrite_spider_settings(spider, scrapy_settings, load_config()) process = CrawlerProcess(scrapy_settings) - process.crawl(name, **kwargs) + process.crawl(spider, **kwargs) for crawler in process.crawlers: crawler.signals.connect(spider_closed, signal=signals.spider_closed) process.start() + + +if __name__ == '__main__': + crawl('pixiv') diff --git a/src/favorites_crawler/utils/config.py b/src/favorites_crawler/utils/config.py index 249398e..1af06b8 100644 --- a/src/favorites_crawler/utils/config.py +++ b/src/favorites_crawler/utils/config.py @@ -1,34 +1,80 @@ import os - import yaml -config_path = 
import copy
import os

import yaml

# Favors home: per-user directory holding config.yml.
DEFAULT_FAVORS_HOME = os.path.expanduser('~/.favorites_crawler')

# Initial config written on first run. Keys in the 'global' section apply to
# every spider; same-named keys in a per-spider section override them.
DEFAULT_CONFIG = {
    'global': {
        'ENABLE_ORGANIZE_BY_ARTIST': True,
        'ENABLE_WRITE_IPTC_KEYWORDS': True,
        'EXIF_TOOL_EXECUTABLE': None,
    },
    'pixiv': {
        'FILES_STORE': 'favorites_crawler_files/pixiv',
        'USER_ID': '',
        'ACCESS_TOKEN': '',
        'REFRESH_TOKEN': '',
    },
    'yandere': {
        'FILES_STORE': 'favorites_crawler_files/yandere',
        'USERNAME': '',
    },
    'twitter': {
        'FILES_STORE': 'favorites_crawler_files/twitter',
        'USER_ID': '',
        'AUTHORIZATION': '',
        'LIKES_ID': '',
        'X_CSRF_TOKEN': '',
    },
    'lemon': {
        'FILES_STORE': 'favorites_crawler_files/lemon',
    },
    'nhentai': {
        'FILES_STORE': 'favorites_crawler_files/nhentai',
    }
}


def load_config(home: str = DEFAULT_FAVORS_HOME) -> dict:
    """Load config from the favors home.

    Creates the home directory on first use; if no config file exists yet,
    writes the defaults and returns them.

    :param home: favors home directory (defaults to ``~/.favorites_crawler``).
    :return: config mapping; always a dict, never ``None``.
    """
    create_favors_home(home)
    config_file = os.path.join(home, 'config.yml')
    if not os.path.exists(config_file):
        # Return a copy so callers mutating their config cannot corrupt the
        # module-level DEFAULT_CONFIG shared by the whole process.
        default = copy.deepcopy(DEFAULT_CONFIG)
        dump_config(default, home)
        return default
    with open(config_file, encoding='utf8') as f:
        # An empty YAML document parses to None -- normalize to {} so callers
        # can safely call .get() on the result.
        return yaml.safe_load(f) or {}


def dump_config(data: dict, home: str = DEFAULT_FAVORS_HOME):
    """Dump config data to ``config.yml`` in the favors home as YAML.

    :param data: config mapping to persist.
    :param home: favors home directory; created if missing.
    """
    create_favors_home(home)
    config_file = os.path.join(home, 'config.yml')
    with open(config_file, 'w', encoding='utf8') as f:
        yaml.safe_dump(data, f, allow_unicode=True)


def create_favors_home(path: str):
    """Create the favors home directory if it does not exist.

    :param path: directory to create (intermediate dirs included).
    """
    # makedirs(exist_ok=True) already tolerates an existing directory, so a
    # separate exists() pre-check is redundant (and TOCTOU-racy).
    os.makedirs(path, exist_ok=True)
def overwrite_spider_settings(spider, default_settings, user_config):
    """
    Overwrite spider settings by user config.
    Priority: favors spider config > favors global config > spider custom settings > scrapy settings

    :param spider: Spider class
    :param default_settings: :class:`scrapy.settings.Settings`
    :param user_config: favorites crawler config
    """
    # Scrapy declares Spider.custom_settings as None by default; normalize to
    # a dict so the .update()/.setdefault() calls below cannot raise
    # AttributeError for spiders that never defined custom_settings.
    if spider.custom_settings is None:
        spider.custom_settings = {}

    global_config = user_config.get('global')
    if global_config:
        spider.custom_settings.update(global_config)

    # Applied after the global section so per-spider keys win (see priority
    # order in the docstring).
    spider_config = user_config.get(spider.name)
    if spider_config:
        spider.custom_settings.update(spider_config)

    # Fall back to <scrapy FILES_STORE>/<spider name> when the user did not
    # configure a download location for this spider.
    default_files_store = os.path.join(default_settings.get('FILES_STORE', ''), spider.name)
    spider.custom_settings.setdefault('FILES_STORE', default_files_store)


# --- tests/test_utils/test_config.py (new file added by this patch) ---

import os
import yaml

from scrapy.utils.project import get_project_settings
from scrapy.spiderloader import SpiderLoader

from favorites_crawler.utils.config import (
    create_favors_home, load_config, dump_config, DEFAULT_CONFIG, overwrite_spider_settings)

scrapy_settings = get_project_settings()
spider_loader = SpiderLoader(scrapy_settings)


class TestCreateFavorsHome:
    # NOTE(review): the original two test names were inverted relative to the
    # scenarios they exercise; renamed to match actual behavior.
    def test_keep_path_when_path_exists(self, tmp_path):
        test_path = tmp_path / "existing_dir"
        test_path.mkdir()

        create_favors_home(str(test_path))

        assert test_path.exists()
        assert test_path.is_dir()

    def test_create_path_when_path_not_exists(self, tmp_path):
        test_path = tmp_path / "non_existing_dir"

        create_favors_home(str(test_path))

        assert test_path.exists()
        assert test_path.is_dir()


class TestLoadConfig:
    def test_load_config_when_config_not_exists(self, tmp_path):
        # First load should write the defaults to disk and return them.
        favors_home = str(tmp_path)

        config = load_config(favors_home)

        assert config == DEFAULT_CONFIG
        config_file = os.path.join(favors_home, 'config.yml')
        assert os.path.exists(config_file)

        with open(config_file, encoding='utf8') as f:
            written_config = yaml.safe_load(f)
            assert written_config == DEFAULT_CONFIG

    def test_load_config_when_config_exists(self, tmp_path):
        # An existing file must be returned as-is, not overwritten by defaults.
        favors_home = str(tmp_path)
        config_file = os.path.join(favors_home, 'config.yml')
        existing_config = {'global': {'ENABLE_ORGANIZE_BY_ARTIST': False}}

        with open(config_file, 'w', encoding='utf8') as f:
            yaml.safe_dump(existing_config, f)

        config = load_config(favors_home)

        assert config == existing_config


class TestDumpConfig:
    def test_dump_config_to_favors_home(self, tmp_path):
        favors_home = str(tmp_path)
        new_config = {'global': {'ENABLE_ORGANIZE_BY_ARTIST': False}}

        dump_config(new_config, favors_home)

        config_file = os.path.join(favors_home, 'config.yml')
        assert os.path.exists(config_file)

        with open(config_file, encoding='utf8') as f:
            written_config = yaml.safe_load(f)
            assert written_config == new_config


class TestOverwriteSpiderSettings:
    def test_overwrite_spider_settings(self):
        user_config = {
            'global': {
                'ENABLE_ORGANIZE_BY_ARTIST': True,
            },
            'pixiv': {
                'FILES_STORE': '/pixiv',
            }
        }
        spider = spider_loader.load('pixiv')

        overwrite_spider_settings(spider, scrapy_settings, user_config)

        assert spider.custom_settings['FILES_STORE'] == user_config['pixiv']['FILES_STORE']
        assert spider.custom_settings['ENABLE_ORGANIZE_BY_ARTIST'] == user_config['global']['ENABLE_ORGANIZE_BY_ARTIST']

    def test_spider_config_priority_should_gt_global_config(self):
        user_config = {
            'global': {
                'ENABLE_ORGANIZE_BY_ARTIST': True,
            },
            'yandere': {
                'ENABLE_ORGANIZE_BY_ARTIST': False,
            }
        }
        spider = spider_loader.load('yandere')

        overwrite_spider_settings(spider, scrapy_settings, user_config)

        assert spider.custom_settings['ENABLE_ORGANIZE_BY_ARTIST'] == user_config['yandere']['ENABLE_ORGANIZE_BY_ARTIST']

    def test_should_set_default_file_store_when_user_doesnt_config_it(self):
        user_config = {}
        spider = spider_loader.load('nhentai')

        overwrite_spider_settings(spider, scrapy_settings, user_config)

        assert spider.custom_settings['FILES_STORE'] == os.path.join(scrapy_settings.get('FILES_STORE', ''), 'nhentai')