Store tags (IPTC/Keywords) to pixiv and yandere image files #49

Merged · 4 commits · Jan 7, 2024
Changes from all commits
17 changes: 11 additions & 6 deletions README.md
@@ -133,20 +133,25 @@ yandere:
## Organize files by artist
If you want to organize pixiv illusts by user, add this line to your config:
```yaml
...
pixiv:
  # FAVORS_PIXIV_ENABLE_ORGANIZE_BY_USER: true  # (Deprecated)
  ENABLE_ORGANIZE_BY_ARTIST: true  # add this line to your pixiv config
...
```
If you want to organize yandere posts by artist, add this line to your config:
```yaml
...
yandere:
  ENABLE_ORGANIZE_BY_ARTIST: true  # add this line to your yandere config
...
```

## Store tags to IPTC/Keywords
Only pixiv and yandere are supported.
```yaml
yandere:
  ENABLE_WRITE_IPTC_KEYWORDS: true  # default: true
  EXIF_TOOL_EXECUTABLE: '<Path to your exiftool executable>'  # default: None
pixiv:
  ENABLE_WRITE_IPTC_KEYWORDS: true  # default: true
  EXIF_TOOL_EXECUTABLE: '<Path to your exiftool executable>'  # default: None
```

# Restore your favorites
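For reference, a quick way to confirm the pipeline wrote the keywords is to read them back with pyexiftool (the dependency this PR adds). A minimal sketch, assuming a downloaded image at a hypothetical path:

```python
from exiftool import ExifToolHelper

# Read back the IPTC keywords written by PicturePipeline.
# 'yandere/123456.jpg' is a hypothetical path; point it at any downloaded image.
with ExifToolHelper() as et:
    for metadata in et.get_tags(['yandere/123456.jpg'], ['IPTC:Keywords']):
        print(metadata.get('IPTC:Keywords'))
```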
3 changes: 2 additions & 1 deletion requirements.txt
@@ -3,4 +3,5 @@ requests==2.27.1
pyyaml==6.0
itemadapter==0.4.0
itemloaders==1.0.4
Twisted==22.10.0
pyexiftool==0.5.6
2 changes: 1 addition & 1 deletion src/favorites_crawler/__init__.py
@@ -35,7 +35,7 @@ def crawl(name, **kwargs):

def spider_closed(spider):
    stats = spider.crawler.stats.get_stats()
-    if not (stats.get('item_scrapped_count', 0) + stats.get('item_dropped_count', 0)):
+    if not (stats.get('item_scraped_count', 0) + stats.get('item_dropped_count', 0)):
        logger.warning('Your cookies or token may have expired.')


2 changes: 1 addition & 1 deletion src/favorites_crawler/constants/headers.py
@@ -4,5 +4,5 @@
PIXIV_REQUEST_HEADERS = {
    'APP-OS': 'ios',
    'APP-OS-Version': '12.6',
-    'Accept-Language': 'zh-CH;zh;q=0.9;en;q=0.8',
+    'Accept-Language': 'en',
}
8 changes: 5 additions & 3 deletions src/favorites_crawler/itemloaders.py
@@ -1,11 +1,11 @@
from itemloaders import ItemLoader
-from itemloaders.processors import Join, Compose, MapCompose
+from itemloaders.processors import Compose, MapCompose

from favorites_crawler import items
from favorites_crawler.processors import take_first, identity, get_nhentai_id, wrap_credits, \
    original_url_from_nhentai_thumb_url, select_best_nhentai_title, clean_nhentai_title, \
    get_year_from_iso_format, get_month_from_iso_format, get_series_from_title, get_volume_from_title, \
-    clean_parodies, get_page
+    clean_parodies, get_lemon_page, get_pixiv_tags, get_yandere_tags


class PixivIllustItemLoader(ItemLoader):
@@ -15,6 +15,7 @@ class PixivIllustItemLoader(ItemLoader):

    file_urls_out = identity
    user_id_out = Compose(take_first, str)
    tags_out = get_pixiv_tags


class YanderePostItemLoader(ItemLoader):
@@ -24,6 +25,7 @@ class YanderePostItemLoader(ItemLoader):

    file_urls_out = identity
    artist_out = Compose(take_first, lambda s: s.strip())
    tags_out = Compose(take_first, get_yandere_tags)


class NHentaiGalleryItemLoader(ItemLoader):
@@ -51,4 +53,4 @@ class LemonPicPostItemLoader(ItemLoader):

    file_urls_out = identity
    tags_out = identity
-    page_out = Compose(take_first, get_page)
+    page_out = Compose(take_first, get_lemon_page)
3 changes: 0 additions & 3 deletions src/favorites_crawler/items.py
@@ -34,7 +34,6 @@ def get_folder_name(self, spider):

@dataclass
class ComicBookInfoItem:
-
    title: str = field(default=None, metadata={'is_comic_info': True})
    series: str = field(default=None, metadata={'is_comic_info': True})
    publisher: str = field(default=None, metadata={'is_comic_info': True})
@@ -71,7 +70,6 @@ def get_comic_info(self):

@dataclass
class PixivIllustItem(BaseItem):
-
    user_id: str = field(default=None)

    def get_folder_name(self, spider):
@@ -83,7 +81,6 @@ def get_folder_name(self, spider):

@dataclass
class YanderePostItem(BaseItem):
-
    artist: str = field(default=None)

    def get_folder_name(self, spider):
80 changes: 65 additions & 15 deletions src/favorites_crawler/pipelines.py
@@ -11,15 +11,79 @@
from scrapy.pipelines.files import FilesPipeline
from itemadapter import ItemAdapter
from twisted.python.failure import Failure
from exiftool import ExifToolHelper

from favorites_crawler.utils.files import create_comic_archive


logger = logging.getLogger(__name__)


-class FavoritesFilePipeline(FilesPipeline):
+class BasePipeline(FilesPipeline):
    def get_media_requests(self, item, info):
        item_dict = ItemAdapter(item).asdict()
        referer = item_dict.get('referer')
        return (Request(url, headers={'referer': referer}) for url in item_dict.get(self.files_urls_field, ()))

    def file_path(self, request, response=None, info=None, *, item=None):
        return item.get_filepath(request.url, info.spider)

    def item_completed(self, results, item, info):
        for result in info.downloaded.values():
            if isinstance(result, Failure):
                logger.error('Error when downloading file: %s', result.value)
        return super().item_completed(results, item, info)


class PicturePipeline(BasePipeline):
    """Save image and add IPTC Keywords to it."""
    def __init__(self, store_uri, download_func=None, settings=None):
        super().__init__(store_uri, download_func=download_func, settings=settings)
        self.write_iptc_keywords = settings.getbool('ENABLE_WRITE_IPTC_KEYWORDS', False)
        if self.write_iptc_keywords:
            try:
                self.exif_tool = ExifToolHelper(executable=settings.get('EXIF_TOOL_EXECUTABLE', None))
                self.exif_tool.run()
            except Exception as e:
                logger.error('Failed to load exiftool, consider installing it or setting EXIF_TOOL_EXECUTABLE. '
                             '\nException: %r', e)
                self.exif_tool = None
        else:
            self.exif_tool = None

    def close_spider(self, _):
        if self.exif_tool and self.exif_tool.running:
            self.exif_tool.terminate()

    def item_completed(self, results, item, info):
        item = super().item_completed(results, item, info)
        if not self.exif_tool:
            return item

        for success, result in results:
            if not (success and item.tags):
                continue
            path = item.get_filepath(result['url'], info.spider)
            try:
                msg = self.exif_tool.set_tags(
                    Path(self.store.basedir) / path,
                    {'Keywords': item.tags},
                    ['-overwrite_original'],
                ).rstrip()
            except Exception as e:
                logger.error('Failed to write tags: %r to "%s", exception: %r', item.tags, path, e)
            else:
                if msg == '1 image files updated':
                    info.spider.crawler.stats.inc_value('iptc_status_count/updated')
                    logger.debug('Successfully wrote tags: %r to "%s", result: %s', item.tags, path, msg)
                else:
                    logger.error('Failed to write tags: %r to "%s", result: %s', item.tags, path, msg)

        return item


class ComicPipeline(BasePipeline):
    """Archive comic as cbz and add ComicBookInfo to it."""
    def __init__(self, store_uri, **kwargs):
        super().__init__(store_uri, **kwargs)
        self.files_path = Path(store_uri).resolve()
@@ -44,17 +108,3 @@ def process_item(self, item, spider):
            self.comic_comments[title] = bytes(comment, encoding='utf-8')

        return super().process_item(item, spider)

-    def get_media_requests(self, item, info):
-        item_dict = ItemAdapter(item).asdict()
-        referer = item_dict.get('referer')
-        return (Request(url, headers={'referer': referer}) for url in item_dict.get(self.files_urls_field, ()))
-
-    def file_path(self, request, response=None, info=None, *, item=None):
-        return item.get_filepath(request.url, info.spider)
-
-    def item_completed(self, results, item, info):
-        for result in info.downloaded.values():
-            if isinstance(result, Failure):
-                logger.error('Error when downloading file: %s', result.value)
-        return super().item_completed(results, item, info)
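Outside of Scrapy, the exiftool lifecycle PicturePipeline relies on looks roughly like this. A minimal sketch with a hypothetical image path and tags, not the pipeline's exact flow:

```python
from exiftool import ExifToolHelper

# One persistent exiftool process: start it once, reuse it per image,
# terminate it when done (mirroring __init__ and close_spider above).
et = ExifToolHelper()  # pass executable=... if exiftool is not on PATH
et.run()
try:
    # 'sample.jpg' and the tag list are illustrative.
    et.set_tags('sample.jpg', {'Keywords': ['tag_a', 'tag_b']}, ['-overwrite_original'])
finally:
    if et.running:
        et.terminate()
```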
20 changes: 19 additions & 1 deletion src/favorites_crawler/processors.py
@@ -120,8 +120,26 @@ def clean_parodies(parodies):
    return parodies.strip()


-def get_page(url):
+def get_lemon_page(url):
    match = re.match(r'https://www\..+html/(\d+)', url)
    if not match:
        return 1
    return int(match.group(1))


def get_pixiv_tags(tags):
    """Return en-us tags."""
    results = set()
    for tag in tags:
        if tag.get('name'):
            results.add(tag['name'].strip().replace(' ', '_').lower())
        if tag.get('translated_name'):
            results.add(tag['translated_name'].strip().replace(' ', '_').lower())
    return list(filter(
        lambda x: re.match(r'^[ -~]+$', x),  # ASCII only
        results,
    ))


def get_yandere_tags(tags):
    return tags.split(' ')
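A small usage sketch of the two new processors; the tag payloads are hypothetical, modeled on pixiv's `name`/`translated_name` fields and yandere's space-separated tag string:

```python
# Hypothetical inputs shaped like each site's tag data.
pixiv_tags = [
    {'name': '風景', 'translated_name': 'scenery'},
    {'name': 'original', 'translated_name': None},
]
# Non-ASCII '風景' is filtered out; order may vary because a set is used.
print(get_pixiv_tags(pixiv_tags))               # e.g. ['scenery', 'original']
print(get_yandere_tags('landscape long_hair'))  # ['landscape', 'long_hair']
```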
6 changes: 4 additions & 2 deletions src/favorites_crawler/settings.py
@@ -27,6 +27,8 @@

TELNETCONSOLE_ENABLED = False

-ITEM_PIPELINES = {'favorites_crawler.pipelines.FavoritesFilePipeline': 0}

FAVORS_PIXIV_ENABLE_ORGANIZE_BY_USER = False

# ExifTool settings
ENABLE_WRITE_IPTC_KEYWORDS = True
EXIF_TOOL_EXECUTABLE = None
3 changes: 3 additions & 0 deletions src/favorites_crawler/spiders/lemon.py
@@ -21,6 +21,9 @@ class LemonSpider(BaseSpider):
            callback='parse',
        ),
    ]
    custom_settings = {
        'ITEM_PIPELINES': {'favorites_crawler.pipelines.ComicPipeline': 0},
    }

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
1 change: 1 addition & 0 deletions src/favorites_crawler/spiders/nhentai.py
@@ -24,6 +24,7 @@ class NHentaiSpider(BaseSpider):
    )
    custom_settings = {
        'CONCURRENT_REQUESTS': 5,
        'ITEM_PIPELINES': {'favorites_crawler.pipelines.ComicPipeline': 0},
    }

    def __init__(self, *args, **kwargs):
1 change: 1 addition & 0 deletions src/favorites_crawler/spiders/pixiv.py
@@ -20,6 +20,7 @@ class PixivSpider(BaseSpider):
        'DEFAULT_REQUEST_HEADERS': PIXIV_REQUEST_HEADERS,
        # Add PixivAuthorizationMiddleware after DefaultHeadersMiddleware
        'DOWNLOADER_MIDDLEWARES': {'favorites_crawler.middlewares.PixivAuthorizationMiddleware': 450},
        'ITEM_PIPELINES': {'favorites_crawler.pipelines.PicturePipeline': 0},
    }

    def start_requests(self):
23 changes: 15 additions & 8 deletions src/favorites_crawler/spiders/yandere.py
@@ -8,7 +8,7 @@
from favorites_crawler.constants.domains import YANDERE_DOMAIN
from favorites_crawler.itemloaders import YanderePostItemLoader
from favorites_crawler.constants.endpoints import YANDERE_LIST_POST_URL, YANDERE_SHOW_POST_URL
-from favorites_crawler.utils.files import list_yandere_id
+from favorites_crawler.utils.files import list_yandere_post


class YandereSpider(BaseSpider):
@@ -17,18 +17,17 @@ class YandereSpider(BaseSpider):
    allowed_domains = (YANDERE_DOMAIN, )
    custom_settings = {
        'CONCURRENT_REQUESTS': 5,
        'ITEM_PIPELINES': {'favorites_crawler.pipelines.PicturePipeline': 0},
    }

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.limit = 100
        self.params = {'page': 1, 'limit': self.limit}
-        self.posts = set()
+        self.posts = {}

    def start_requests(self):
-        self.posts = set(list_yandere_id(pathlib.Path(self.settings.get('FILES_STORE')), include_subdir=True))
-        self.logger.debug(f'{len(self.posts)} posts will skip download.')
+        self.posts = list_yandere_post(pathlib.Path(self.settings.get('FILES_STORE')), include_subdir=True)
        username = self.custom_settings.get('USERNAME')
        if not username:
            raise CloseSpider('Did you run "favors login yandere"?')
@@ -45,12 +44,20 @@ def parse_start_url(self, response, **kwargs):
        yield Request(f'{YANDERE_LIST_POST_URL}?{urlencode(self.params)}', callback=self.parse_start_url)

        for post in posts:
-            if str(post['id']) in self.posts:
-                continue
+            post_id = str(post['id'])
+            if post_id in self.posts:
+                path = self.posts[post_id]  # type: pathlib.Path
+                if (path.name ==
+                        YanderePostItemLoader.default_item_class().get_filename(post['file_url'], self)):
+                    continue
+                path.unlink(missing_ok=True)

            loader = YanderePostItemLoader()
            loader.add_value('file_urls', post['file_url'])
            loader.add_value('tags', post['tags'])

            if self.settings.getbool('ENABLE_ORGANIZE_BY_ARTIST'):
-                yield Request(YANDERE_SHOW_POST_URL.format(id=post['id']),
+                yield Request(YANDERE_SHOW_POST_URL.format(id=post_id),
                              callback=self.parse, cb_kwargs={'loader': loader})
            else:
                yield loader.load_item()
10 changes: 4 additions & 6 deletions src/favorites_crawler/spiders/yandere_vote.py
@@ -5,7 +5,7 @@
from favorites_crawler.spiders import BaseSpider
from favorites_crawler.constants.domains import YANDERE_DOMAIN
from favorites_crawler.constants.endpoints import YANDERE_VOTE_POST_URL
-from favorites_crawler.utils.files import list_yandere_id
+from favorites_crawler.utils.files import list_yandere_post


class YandereVoteSpider(BaseSpider):
@@ -27,12 +27,10 @@ def __init__(self, csrf_token, cookie, score, path, *args, **kwargs):
        self.path = Path(path)

    def start_requests(self):
-        yandere_id_list = list_yandere_id(self.path)
-        self.crawler.stats.set_value('file_count', len(yandere_id_list))
-        yandere_id_set = set(yandere_id_list)
-        self.crawler.stats.set_value('voted/expected', len(yandere_id_set))
+        yandere_id_list = list(list_yandere_post(self.path).keys())
+        self.crawler.stats.set_value('voted/expected', len(yandere_id_list))

-        for i in yandere_id_set:
+        for i in yandere_id_list:
            yield FormRequest(YANDERE_VOTE_POST_URL,
                              formdata={'id': str(i), 'score': str(self.score)},
                              cookies=self.cookies, headers=self.headers,
8 changes: 4 additions & 4 deletions src/favorites_crawler/utils/files.py
@@ -19,13 +19,13 @@ def create_comic_archive(path: Path, comment=b''):
    return archive_name


-def list_yandere_id(path=Path('.'), include_subdir=False, result=None):
-    result = [] if result is None else result
+def list_yandere_post(path=Path('.'), include_subdir=False, result=None):
+    result = {} if result is None else result
    for file_or_dir in path.iterdir():
        if file_or_dir.is_file():
            id_ = get_yandere_post_id(file_or_dir.name)
            if id_:
-                result.append(id_)
+                result[id_] = file_or_dir
        elif include_subdir:
-            list_yandere_id(file_or_dir, include_subdir, result)
+            list_yandere_post(file_or_dir, include_subdir, result)
    return result
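list_yandere_post now returns a dict of post ID to file Path (rather than a list of IDs), which is what lets YandereSpider compare filenames and unlink stale files. A minimal usage sketch, assuming a hypothetical store directory whose filenames get_yandere_post_id can parse:

```python
from pathlib import Path

# Hypothetical store directory of previously downloaded posts.
posts = list_yandere_post(Path('/data/yandere'), include_subdir=True)
# e.g. {'123456': Path('/data/yandere/yandere 123456 tags.jpg'), ...}
for post_id, path in posts.items():
    print(post_id, path.name)
```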
2 changes: 1 addition & 1 deletion tests/test_init.py
@@ -19,7 +19,7 @@ def test_spider_closed_should_call_warn(mock_logger):
@pytest.mark.parametrize('item_scrapped_count,item_dropped_count', ((1, 0), (0, 1), (1, 1)))
def test_spider_closed_should_not_call_warn(mock_logger, item_scrapped_count, item_dropped_count):
    mock_spider = MagicMock()
-    stats = {'item_scrapped_count': item_scrapped_count, 'item_dropped_count': item_dropped_count}
+    stats = {'item_scraped_count': item_scrapped_count, 'item_dropped_count': item_dropped_count}
    mock_spider.crawler.stats.get_stats.return_value = {k: v for k, v in stats.items() if v}

    spider_closed(mock_spider)